mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-03 18:00:10 +01:00
gallium/swr: Remove driver source
The OpenSWR will be maintained on a classic/LTS branch. Reviewed-by: Dylan Baker <dylan@pnwbakers.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11264>
This commit is contained in:
parent
d22d328859
commit
855793c6c6
178 changed files with 0 additions and 85594 deletions
|
|
@ -1,64 +0,0 @@
|
|||
---
|
||||
Language: Cpp
|
||||
AccessModifierOffset: -3
|
||||
AlignAfterOpenBracket: true
|
||||
AlignEscapedNewlinesLeft: false
|
||||
AlignOperands: false
|
||||
AlignTrailingComments: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AlwaysBreakAfterDefinitionReturnType: true
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
BreakBeforeBinaryOperators: NonAssignment
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: true
|
||||
BinPackParameters: false
|
||||
BinPackArguments: false
|
||||
ColumnLimit: 78
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 3
|
||||
DerivePointerAlignment: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
IndentCaseLabels: false
|
||||
IndentWrappedFunctionNames: false
|
||||
IndentFunctionDeclarationAfterType: false
|
||||
MaxEmptyLinesToKeep: 2
|
||||
KeepEmptyLinesAtTheStartOfBlocks: true
|
||||
NamespaceIndentation: Inner
|
||||
ObjCBlockIndentWidth: 3
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakBeforeFirstCallParameter: 19
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 0
|
||||
PointerAlignment: Right
|
||||
SpacesBeforeTrailingComments: 1
|
||||
Cpp11BracedListStyle: true
|
||||
Standard: Cpp11
|
||||
IndentWidth: 3
|
||||
TabWidth: 8
|
||||
UseTab: Never
|
||||
BreakBeforeBraces: Linux
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpacesInAngles: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpaceAfterCStyleCast: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
ContinuationIndentWidth: 3
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
|
||||
SpaceBeforeParens: ControlStatements
|
||||
DisableFormat: false
|
||||
...
|
||||
|
||||
|
|
@ -1,411 +0,0 @@
|
|||
# Copyright © 2017-2020 Intel Corporation
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# Sources and headers compiled into every SWR library: each per-architecture
# rasterizer library as well as the 'mesaswr' driver library list this set.
# Each file must appear exactly once.
files_swr_common = files(
  'rasterizer/common/formats.cpp',
  'rasterizer/common/formats.h',
  'rasterizer/common/intrin.h',
  'rasterizer/common/isa.hpp',
  'rasterizer/common/os.cpp',
  'rasterizer/common/os.h',
  'rasterizer/common/rdtsc_buckets.cpp',
  'rasterizer/common/rdtsc_buckets.h',
  'rasterizer/common/rdtsc_buckets_shared.h',
  'rasterizer/common/simd16intrin.h',
  'rasterizer/common/simdintrin.h',
  'rasterizer/common/simdlib.hpp',
  'rasterizer/common/simdlib_interface.hpp',
  'rasterizer/common/simdlib_types.hpp',
  'rasterizer/common/swr_assert.cpp',
  'rasterizer/common/swr_assert.h',
)
|
||||
|
||||
files_swr_mesa = files(
|
||||
'swr_loader.cpp',
|
||||
'swr_clear.cpp',
|
||||
'swr_context.cpp',
|
||||
'swr_context.h',
|
||||
'swr_draw.cpp',
|
||||
'swr_public.h',
|
||||
'swr_resource.h',
|
||||
'swr_screen.cpp',
|
||||
'swr_screen.h',
|
||||
'swr_state.cpp',
|
||||
'swr_state.h',
|
||||
'swr_tex_sample.cpp',
|
||||
'swr_tex_sample.h',
|
||||
'swr_scratch.h',
|
||||
'swr_scratch.cpp',
|
||||
'swr_shader.cpp',
|
||||
'swr_shader.h',
|
||||
'swr_memory.h',
|
||||
'swr_fence.h',
|
||||
'swr_fence.cpp',
|
||||
'swr_fence_work.h',
|
||||
'swr_fence_work.cpp',
|
||||
'swr_query.h',
|
||||
'swr_query.cpp',
|
||||
'rasterizer/jitter/blend_jit.cpp',
|
||||
'rasterizer/jitter/blend_jit.h',
|
||||
'rasterizer/jitter/builder.cpp',
|
||||
'rasterizer/jitter/builder.h',
|
||||
'rasterizer/jitter/builder_math.h',
|
||||
'rasterizer/jitter/builder_mem.cpp',
|
||||
'rasterizer/jitter/builder_mem.h',
|
||||
'rasterizer/jitter/builder_gfx_mem.cpp',
|
||||
'rasterizer/jitter/builder_gfx_mem.h',
|
||||
'rasterizer/jitter/builder_misc.cpp',
|
||||
'rasterizer/jitter/builder_misc.h',
|
||||
'rasterizer/jitter/fetch_jit.cpp',
|
||||
'rasterizer/jitter/fetch_jit.h',
|
||||
'rasterizer/jitter/jit_api.h',
|
||||
'rasterizer/jitter/JitManager.cpp',
|
||||
'rasterizer/jitter/JitManager.h',
|
||||
'rasterizer/jitter/streamout_jit.cpp',
|
||||
'rasterizer/jitter/streamout_jit.h',
|
||||
'rasterizer/jitter/shader_lib/DebugOutput.cpp',
|
||||
'rasterizer/jitter/shader_lib/Scatter.cpp',
|
||||
'rasterizer/jitter/functionpasses/lower_x86.cpp',
|
||||
'rasterizer/memory/SurfaceState.h'
|
||||
)
|
||||
|
||||
files_swr_arch = files(
|
||||
'rasterizer/archrast/archrast.cpp',
|
||||
'rasterizer/archrast/archrast.h',
|
||||
'rasterizer/archrast/eventmanager.h',
|
||||
'rasterizer/core/api.cpp',
|
||||
'rasterizer/core/api.h',
|
||||
'rasterizer/core/arena.h',
|
||||
'rasterizer/core/backend.cpp',
|
||||
'rasterizer/core/backend_clear.cpp',
|
||||
'rasterizer/core/backend_sample.cpp',
|
||||
'rasterizer/core/backend_singlesample.cpp',
|
||||
'rasterizer/core/backend.h',
|
||||
'rasterizer/core/backend_impl.h',
|
||||
'rasterizer/core/binner.cpp',
|
||||
'rasterizer/core/binner.h',
|
||||
'rasterizer/core/blend.h',
|
||||
'rasterizer/core/clip.cpp',
|
||||
'rasterizer/core/clip.h',
|
||||
'rasterizer/core/conservativeRast.h',
|
||||
'rasterizer/core/context.h',
|
||||
'rasterizer/core/depthstencil.h',
|
||||
'rasterizer/core/fifo.hpp',
|
||||
'rasterizer/core/format_conversion.h',
|
||||
'rasterizer/core/format_traits.h',
|
||||
'rasterizer/core/format_types.h',
|
||||
'rasterizer/core/format_utils.h',
|
||||
'rasterizer/core/frontend.cpp',
|
||||
'rasterizer/core/frontend.h',
|
||||
'rasterizer/core/knobs.h',
|
||||
'rasterizer/core/knobs_init.h',
|
||||
'rasterizer/core/multisample.h',
|
||||
'rasterizer/core/pa_avx.cpp',
|
||||
'rasterizer/core/pa.h',
|
||||
'rasterizer/core/rasterizer.cpp',
|
||||
'rasterizer/core/rasterizer.h',
|
||||
'rasterizer/core/rasterizer_impl.h',
|
||||
'rasterizer/core/rdtsc_core.cpp',
|
||||
'rasterizer/core/rdtsc_core.h',
|
||||
'rasterizer/core/ringbuffer.h',
|
||||
'rasterizer/core/state.h',
|
||||
'rasterizer/core/state_funcs.h',
|
||||
'rasterizer/core/tessellator.h',
|
||||
'rasterizer/core/tessellator.hpp',
|
||||
'rasterizer/core/tessellator.cpp',
|
||||
'rasterizer/core/threads.cpp',
|
||||
'rasterizer/core/threads.h',
|
||||
'rasterizer/core/tilemgr.cpp',
|
||||
'rasterizer/core/tilemgr.h',
|
||||
'rasterizer/core/tileset.h',
|
||||
'rasterizer/core/utils.h',
|
||||
'rasterizer/memory/ClearTile.cpp',
|
||||
'rasterizer/memory/Convert.h',
|
||||
'rasterizer/memory/LoadTile.cpp',
|
||||
'rasterizer/memory/LoadTile.h',
|
||||
'rasterizer/memory/LoadTile_Linear.cpp',
|
||||
'rasterizer/memory/LoadTile_TileX.cpp',
|
||||
'rasterizer/memory/LoadTile_TileY.cpp',
|
||||
'rasterizer/memory/StoreTile.cpp',
|
||||
'rasterizer/memory/StoreTile.h',
|
||||
'rasterizer/memory/StoreTile_Linear2.cpp',
|
||||
'rasterizer/memory/StoreTile_Linear.cpp',
|
||||
'rasterizer/memory/StoreTile_TileW.cpp',
|
||||
'rasterizer/memory/StoreTile_TileX2.cpp',
|
||||
'rasterizer/memory/StoreTile_TileX.cpp',
|
||||
'rasterizer/memory/StoreTile_TileY2.cpp',
|
||||
'rasterizer/memory/StoreTile_TileY.cpp',
|
||||
'rasterizer/memory/TilingFunctions.h',
|
||||
'rasterizer/memory/tilingtraits.h',
|
||||
'rasterizer/memory/InitMemory.h',
|
||||
'rasterizer/memory/InitMemory.cpp',
|
||||
'rasterizer/memory/SurfaceState.h'
|
||||
)
|
||||
|
||||
swr_context_files = files('swr_context.h')
|
||||
swr_state_files = files('rasterizer/core/state.h')
|
||||
swr_surf_state_files = files('rasterizer/memory/SurfaceState.h')
|
||||
swr_event_proto_files = files('rasterizer/archrast/events.proto')
|
||||
swr_event_pproto_files = files('rasterizer/archrast/events_private.proto')
|
||||
swr_gen_backend_files = files('rasterizer/codegen/templates/gen_backend.cpp')
|
||||
swr_gen_rasterizer_files = files('rasterizer/codegen/templates/gen_rasterizer.cpp')
|
||||
swr_gen_header_init_files = files('rasterizer/codegen/templates/gen_header_init.hpp')
|
||||
|
||||
swr_gen_llvm_ir_macros_py = files('rasterizer/codegen/gen_llvm_ir_macros.py')
|
||||
swr_gen_backends_py = files('rasterizer/codegen/gen_backends.py')
|
||||
|
||||
swr_gen_builder_depends = files(
|
||||
'rasterizer/codegen/templates/gen_builder.hpp',
|
||||
'rasterizer/codegen/gen_common.py'
|
||||
)
|
||||
|
||||
|
||||
subdir('rasterizer/jitter')
|
||||
subdir('rasterizer/codegen')
|
||||
subdir('rasterizer/core/backends')
|
||||
|
||||
swr_incs = include_directories(
|
||||
'rasterizer/codegen', 'rasterizer/core', 'rasterizer/jitter',
|
||||
'rasterizer/archrast', 'rasterizer',
|
||||
)
|
||||
|
||||
# Optional compiler flags applied to all SWR C++ code; each one is only added
# when the C++ compiler reports that it accepts it.
swr_cpp_args = []
foreach swr_optional_arg : ['-fno-strict-aliasing', '-Wno-aligned-new']
  if cpp.has_argument(swr_optional_arg)
    swr_cpp_args += swr_optional_arg
  endif
endforeach
|
||||
|
||||
|
||||
swr_arch_libs = []
|
||||
swr_defines = []
|
||||
|
||||
# Pick the first spelling of the "enable AVX" flag that the C++ compiler
# accepts. The result is used for the AVX architecture library and for the
# 'mesaswr' driver library, so configuration aborts when no spelling works.
swr_avx_args = cpp.first_supported_argument(
  '-target-cpu=sandybridge', '-mavx', '-march=core-avx', '-tp=sandybridge',
  '/arch:AVX',
)
if swr_avx_args == []
  error('Cannot find AVX support for swr. (these are required for SWR on all architectures.)')
endif
|
||||
|
||||
shared_swr = get_option('shared-swr')
|
||||
if not shared_swr
|
||||
if with_swr_arches.length() > 1
|
||||
error('When SWR is linked statically only one architecture is allowed.')
|
||||
endif
|
||||
swr_defines += '-DHAVE_SWR_BUILTIN'
|
||||
endif
|
||||
|
||||
if with_swr_arches.contains('skx')
|
||||
swr_skx_args = cpp.first_supported_argument(
|
||||
'-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512',
|
||||
)
|
||||
if swr_skx_args == []
|
||||
error('Cannot find SKX support for swr.')
|
||||
endif
|
||||
|
||||
swr_defines += '-DHAVE_SWR_SKX'
|
||||
if shared_swr
|
||||
swr_arch_libs += shared_library(
|
||||
'swrSKX',
|
||||
[files_swr_common, files_swr_arch],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
|
||||
'-DKNOB_ARCH=KNOB_ARCH_AVX512',
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
link_args : [ld_args_gc_sections],
|
||||
include_directories : [swr_incs],
|
||||
dependencies : [dep_thread, dep_llvm],
|
||||
version : '0.0.0',
|
||||
soversion : host_machine.system() == 'windows' ? '' : '0',
|
||||
install : true,
|
||||
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
|
||||
)
|
||||
else
|
||||
swr_arch_libs += static_library(
|
||||
'swrSKX',
|
||||
[files_swr_common, files_swr_arch],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
|
||||
'-DKNOB_ARCH=KNOB_ARCH_AVX512',
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
link_args : [ld_args_gc_sections],
|
||||
include_directories : [swr_incs],
|
||||
dependencies : [dep_thread, dep_llvm],
|
||||
)
|
||||
endif
|
||||
endif
|
||||
|
||||
if with_swr_arches.contains('knl')
|
||||
swr_knl_args = cpp.first_supported_argument(
|
||||
'-march=knl', '-target-cpu=mic-knl', '-xMIC-AVX512',
|
||||
)
|
||||
if swr_knl_args == []
|
||||
error('Cannot find KNL support for swr.')
|
||||
endif
|
||||
|
||||
swr_defines += '-DHAVE_SWR_KNL'
|
||||
if shared_swr
|
||||
swr_arch_libs += shared_library(
|
||||
'swrKNL',
|
||||
[files_swr_common, files_swr_arch],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
|
||||
'-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
link_args : [ld_args_gc_sections],
|
||||
include_directories : [swr_incs],
|
||||
dependencies : [dep_thread, dep_llvm],
|
||||
version : '0.0.0',
|
||||
soversion : host_machine.system() == 'windows' ? '' : '0',
|
||||
install : true,
|
||||
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
|
||||
)
|
||||
else
|
||||
swr_arch_libs += static_library(
|
||||
'swrKNL',
|
||||
[files_swr_common, files_swr_arch],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
|
||||
'-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
link_args : [ld_args_gc_sections],
|
||||
include_directories : [swr_incs],
|
||||
dependencies : [dep_thread, dep_llvm],
|
||||
)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
# AVX2 flavour of the rasterizer: built as a shared library when 'shared-swr'
# is enabled, otherwise as a static library linked into the driver.
if with_swr_arches.contains('avx2')
  # Prefer a single "target Haswell"-style flag when the compiler knows one.
  swr_avx2_args = cpp.first_supported_argument(
    '-target-cpu=haswell', '-march=core-avx2', '-tp=haswell', '/arch:AVX2',
  )
  if swr_avx2_args == []
    # Fall back to enabling the individual ISA extensions. has_argument()
    # checks exactly one flag, so the combined set has to be probed with
    # has_multi_arguments().
    if cpp.has_multi_arguments('-mavx2', '-mfma', '-mbmi2', '-mf16c')
      swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c']
    else
      error('Cannot find AVX2 support for swr.')
    endif
  endif

  swr_defines += '-DHAVE_SWR_AVX2'
  if shared_swr
    swr_arch_libs += shared_library(
      'swrAVX2',
      [files_swr_common, files_swr_arch],
      cpp_args : [
        cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
        '-DKNOB_ARCH=KNOB_ARCH_AVX2',
      ],
      gnu_symbol_visibility : 'hidden',
      link_args : [ld_args_gc_sections],
      include_directories : [swr_incs],
      dependencies : [dep_thread, dep_llvm],
      version : '0.0.0',
      soversion : host_machine.system() == 'windows' ? '' : '0',
      install : true,
      name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
    )
  else
    swr_arch_libs += static_library(
      'swrAVX2',
      [files_swr_common, files_swr_arch],
      cpp_args : [
        cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
        '-DKNOB_ARCH=KNOB_ARCH_AVX2',
      ],
      gnu_symbol_visibility : 'hidden',
      link_args : [ld_args_gc_sections],
      include_directories : [swr_incs],
      dependencies : [dep_thread, dep_llvm],
    )
  endif
endif
|
||||
|
||||
if with_swr_arches.contains('avx')
|
||||
swr_defines += '-DHAVE_SWR_AVX'
|
||||
if shared_swr
|
||||
swr_arch_libs += shared_library(
|
||||
'swrAVX',
|
||||
[files_swr_common, files_swr_arch],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
|
||||
'-DKNOB_ARCH=KNOB_ARCH_AVX',
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
link_args : [ld_args_gc_sections],
|
||||
include_directories : [swr_incs],
|
||||
dependencies : [dep_thread, dep_llvm],
|
||||
version : '0.0.0',
|
||||
soversion : host_machine.system() == 'windows' ? '' : '0',
|
||||
install : true,
|
||||
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
|
||||
)
|
||||
else
|
||||
swr_arch_libs += static_library(
|
||||
'swrAVX',
|
||||
[files_swr_common, files_swr_arch],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
|
||||
'-DKNOB_ARCH=KNOB_ARCH_AVX',
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
link_args : [ld_args_gc_sections],
|
||||
include_directories : [swr_incs],
|
||||
dependencies : [dep_thread, dep_llvm],
|
||||
)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
if swr_arch_libs == []
|
||||
error('SWR configured, but no SWR architectures configured')
|
||||
endif
|
||||
|
||||
# The swr_avx_args are needed for intrinsic usage in swr api headers.
|
||||
libmesaswr = static_library(
|
||||
'mesaswr',
|
||||
[files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp,
|
||||
gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp],
|
||||
cpp_args : [
|
||||
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
|
||||
swr_defines,
|
||||
],
|
||||
gnu_symbol_visibility : 'hidden',
|
||||
include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, swr_incs],
|
||||
dependencies : [dep_llvm, idep_mesautil],
|
||||
)
|
||||
|
||||
link_libs = [libmesaswr]
|
||||
if not shared_swr
|
||||
link_libs += swr_arch_libs
|
||||
endif
|
||||
|
||||
driver_swr = declare_dependency(
|
||||
compile_args : '-DGALLIUM_SWR',
|
||||
link_with : link_libs
|
||||
)
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
((prog-mode
|
||||
(c-basic-offset . 4)
|
||||
(c-file-style . "k&r")
|
||||
(fill-column . 78)
|
||||
(indent-tabs-mode . nil)
|
||||
(show-trailing-whitespace . t)
|
||||
)
|
||||
)
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: LLVM
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveAssignments: true
|
||||
AlignConsecutiveDeclarations: true
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: Inline
|
||||
AllowShortIfStatementsOnASingleLine: false
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: true
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false
|
||||
BraceWrapping:
|
||||
AfterClass: true
|
||||
AfterControlStatement: true
|
||||
AfterEnum: true
|
||||
AfterFunction: true
|
||||
AfterNamespace: true
|
||||
AfterObjCDeclaration: true
|
||||
AfterStruct: true
|
||||
AfterUnion: true
|
||||
#AfterExternBlock: false
|
||||
BeforeCatch: true
|
||||
BeforeElse: true
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Custom
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: AfterColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 100
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
#IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
|
||||
Priority: 2
|
||||
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
|
||||
Priority: 3
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
IncludeIsMainRegex: '(Test)?$'
|
||||
IndentCaseLabels: false
|
||||
#IndentPPDirectives: AfterHash
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: All
|
||||
ObjCBlockIndentWidth: 4
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 19
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
PointerAlignment: Left
|
||||
#RawStringFormats:
|
||||
# - Delimiter: pb
|
||||
# Language: TextProto
|
||||
# BasedOnStyle: google
|
||||
ReflowComments: true
|
||||
SortIncludes: false
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp11
|
||||
TabWidth: 4
|
||||
UseTab: Never
|
||||
...
|
||||
|
|
@ -1,708 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file archrast.cpp
|
||||
*
|
||||
* @brief Implementation for archrast.
|
||||
*
|
||||
******************************************************************************/
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <map>
|
||||
|
||||
#include "common/os.h"
|
||||
#include "archrast/archrast.h"
|
||||
#include "archrast/eventmanager.h"
|
||||
#include "gen_ar_event.hpp"
|
||||
#include "gen_ar_eventhandlerfile.hpp"
|
||||
|
||||
namespace ArchRast
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Counters accumulated from depth and stencil test events
///        (early/late test, pass/fail). All counters start at zero.
struct DepthStencilStats
{
    // Depth (Z) test tallies.
    uint32_t earlyZTestPassCount{0};
    uint32_t earlyZTestFailCount{0};
    uint32_t lateZTestPassCount{0};
    uint32_t lateZTestFailCount{0};

    // Stencil test tallies.
    uint32_t earlyStencilTestPassCount{0};
    uint32_t earlyStencilTestFailCount{0};
    uint32_t lateStencilTestPassCount{0};
    uint32_t lateStencilTestFailCount{0};
};
|
||||
|
||||
/// @brief Trivial-reject / trivial-accept / must-clip counters
///        (presumably clipper statistics -- confirm against the handler
///        that fills them).
///
/// Members are zero-initialized like the sibling stat structs so that a
/// default-constructed instance never exposes indeterminate counts.
struct CStats
{
    uint32_t trivialRejectCount = 0;
    uint32_t trivialAcceptCount = 0;
    uint32_t mustClipCount      = 0;
};
|
||||
|
||||
/// @brief Holds a single counter of input primitives, starting at zero.
struct TEStats
{
    //@todo:: Rename this to numPatches. The current name assumes one patch per
    //        prim; as long as that holds, the count is fine.
    uint32_t inputPrims{0};
};
|
||||
|
||||
/// @brief Input-primitive, generated-primitive and input-vertex counters
///        (presumably for the geometry shader stage -- confirm against the
///        handler that fills them).
///
/// Members are zero-initialized like the sibling stat structs so that a
/// default-constructed instance never exposes indeterminate counts.
struct GSStateInfo
{
    uint32_t inputPrimCount     = 0;
    uint32_t primGeneratedCount = 0;
    uint32_t vertsInput         = 0;
};
|
||||
|
||||
/// @brief Holds a single raster-tile counter, starting at zero.
struct RastStats
{
    uint32_t rasterTiles{0};
};
|
||||
|
||||
/// @brief Counters for degenerate and backface primitives, starting at zero.
struct CullStats
{
    uint32_t degeneratePrimCount{0};
    uint32_t backfacePrimCount{0};
};
|
||||
|
||||
/// @brief Counters for alpha test and alpha blend, starting at zero.
struct AlphaStats
{
    uint32_t alphaTestCount{0};
    uint32_t alphaBlendCount{0};
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Event handler that handles API thread events. This is shared
|
||||
/// between the API and its caller (e.g. driver shim) but typically
|
||||
/// there is only a single API thread per context. So you can save
|
||||
/// information in the class to be used for other events.
|
||||
class EventHandlerApiStats : public EventHandlerFile
|
||||
{
|
||||
public:
|
||||
EventHandlerApiStats(uint32_t id) : EventHandlerFile(id)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
// Attempt to copy the events.proto file to the ArchRast output dir. It's common for
|
||||
// tools to place the events.proto file in the DEBUG_OUTPUT_DIR when launching AR. If it
|
||||
// exists, this will attempt to copy it the first time we get here to package it with
|
||||
// the stats. Otherwise, the user would need to specify the events.proto location when
|
||||
// parsing the stats in post.
|
||||
std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename;
|
||||
eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto" << std::ends;
|
||||
eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1)
|
||||
<< "\\events.proto" << std::ends;
|
||||
|
||||
// If event.proto already exists, we're done; else do the copy
|
||||
struct stat buf; // Use a Posix stat for file existence check
|
||||
if (!stat(eventsProtoDstFilename.str().c_str(), &buf) == 0)
|
||||
{
|
||||
// Now check to make sure the events.proto source exists
|
||||
if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0)
|
||||
{
|
||||
std::ifstream srcFile;
|
||||
srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary);
|
||||
if (srcFile.is_open())
|
||||
{
|
||||
// Just do a binary buffer copy
|
||||
std::ofstream dstFile;
|
||||
dstFile.open(eventsProtoDstFilename.str().c_str(), std::ios::binary);
|
||||
dstFile << srcFile.rdbuf();
|
||||
dstFile.close();
|
||||
}
|
||||
srcFile.close();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
virtual void Handle(const DrawInstancedEvent& event)
|
||||
{
|
||||
DrawInfoEvent e(event.data.drawId,
|
||||
ArchRast::Instanced,
|
||||
event.data.topology,
|
||||
event.data.numVertices,
|
||||
0,
|
||||
0,
|
||||
event.data.startVertex,
|
||||
event.data.numInstances,
|
||||
event.data.startInstance,
|
||||
event.data.tsEnable,
|
||||
event.data.gsEnable,
|
||||
event.data.soEnable,
|
||||
event.data.soTopology,
|
||||
event.data.splitId);
|
||||
|
||||
EventHandlerFile::Handle(e);
|
||||
}
|
||||
|
||||
virtual void Handle(const DrawIndexedInstancedEvent& event)
|
||||
{
|
||||
DrawInfoEvent e(event.data.drawId,
|
||||
ArchRast::IndexedInstanced,
|
||||
event.data.topology,
|
||||
0,
|
||||
event.data.numIndices,
|
||||
event.data.indexOffset,
|
||||
event.data.baseVertex,
|
||||
event.data.numInstances,
|
||||
event.data.startInstance,
|
||||
event.data.tsEnable,
|
||||
event.data.gsEnable,
|
||||
event.data.soEnable,
|
||||
event.data.soTopology,
|
||||
event.data.splitId);
|
||||
|
||||
EventHandlerFile::Handle(e);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Event handler that handles worker thread events. There is one
|
||||
/// event handler per thread. The python script will need to sum
|
||||
/// up counters across all of the threads.
|
||||
class EventHandlerWorkerStats : public EventHandlerFile
|
||||
{
|
||||
public:
|
||||
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
|
||||
{
|
||||
memset(mShaderStats, 0, sizeof(mShaderStats));
|
||||
}
|
||||
|
||||
virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
|
||||
{
|
||||
// earlyZ test compute
|
||||
mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSSingleSample.earlyZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyStencil test compute
|
||||
mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSSingleSample.earlyStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyZ test single and multi sample
|
||||
mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSCombined.earlyZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyStencil test single and multi sample
|
||||
mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSCombined.earlyStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const EarlyDepthStencilInfoSampleRate& event)
|
||||
{
|
||||
// earlyZ test compute
|
||||
mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSSampleRate.earlyZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyStencil test compute
|
||||
mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSSampleRate.earlyStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyZ test single and multi sample
|
||||
mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSCombined.earlyZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyStencil test single and multi sample
|
||||
mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSCombined.earlyStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const EarlyDepthStencilInfoNullPS& event)
|
||||
{
|
||||
// earlyZ test compute
|
||||
mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSNullPS.earlyZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// earlyStencil test compute
|
||||
mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSNullPS.earlyStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const LateDepthStencilInfoSingleSample& event)
|
||||
{
|
||||
// lateZ test compute
|
||||
mDSSingleSample.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSSingleSample.lateZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateStencil test compute
|
||||
mDSSingleSample.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSSingleSample.lateStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateZ test single and multi sample
|
||||
mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSCombined.lateZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateStencil test single and multi sample
|
||||
mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSCombined.lateStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const LateDepthStencilInfoSampleRate& event)
|
||||
{
|
||||
// lateZ test compute
|
||||
mDSSampleRate.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSSampleRate.lateZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateStencil test compute
|
||||
mDSSampleRate.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSSampleRate.lateStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateZ test single and multi sample
|
||||
mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSCombined.lateZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateStencil test single and multi sample
|
||||
mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSCombined.lateStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const LateDepthStencilInfoNullPS& event)
|
||||
{
|
||||
// lateZ test compute
|
||||
mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
|
||||
mDSNullPS.lateZTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
|
||||
|
||||
// lateStencil test compute
|
||||
mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
|
||||
mDSNullPS.lateStencilTestFailCount +=
|
||||
_mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const EarlyDepthInfoPixelRate& event)
|
||||
{
|
||||
// earlyZ test compute
|
||||
mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount;
|
||||
mDSPixelRate.earlyZTestFailCount +=
|
||||
(_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
|
||||
virtual void Handle(const LateDepthInfoPixelRate& event)
|
||||
{
|
||||
// lateZ test compute
|
||||
mDSPixelRate.lateZTestPassCount += event.data.depthPassCount;
|
||||
mDSPixelRate.lateZTestFailCount +=
|
||||
(_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
|
||||
virtual void Handle(const ClipInfoEvent& event)
|
||||
{
|
||||
mClipper.mustClipCount += _mm_popcnt_u32(event.data.clipMask);
|
||||
mClipper.trivialRejectCount +=
|
||||
event.data.numInvocations - _mm_popcnt_u32(event.data.validMask);
|
||||
mClipper.trivialAcceptCount +=
|
||||
_mm_popcnt_u32(event.data.validMask & ~event.data.clipMask);
|
||||
}
|
||||
|
||||
/// Adds every counter of *pStatUpdate to the matching counter of *pStatTotals.
/// @param pStatTotals running totals, modified in place
/// @param pStatUpdate increment to apply; only read
void UpdateStats(SWR_SHADER_STATS* pStatTotals, const SWR_SHADER_STATS* pStatUpdate)
{
    SWR_SHADER_STATS&       totals = *pStatTotals;
    const SWR_SHADER_STATS& delta  = *pStatUpdate;

    totals.numInstExecuted += delta.numInstExecuted;
    totals.numSampleExecuted += delta.numSampleExecuted;
    totals.numSampleLExecuted += delta.numSampleLExecuted;
    totals.numSampleBExecuted += delta.numSampleBExecuted;
    totals.numSampleCExecuted += delta.numSampleCExecuted;
    totals.numSampleCLZExecuted += delta.numSampleCLZExecuted;
    totals.numSampleCDExecuted += delta.numSampleCDExecuted;
    totals.numGather4Executed += delta.numGather4Executed;
    totals.numGather4CExecuted += delta.numGather4CExecuted;
    totals.numGather4CPOExecuted += delta.numGather4CPOExecuted;
    totals.numGather4CPOCExecuted += delta.numGather4CPOCExecuted;
    totals.numLodExecuted += delta.numLodExecuted;
}
|
||||
|
||||
virtual void Handle(const VSStats& event)
|
||||
{
|
||||
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
|
||||
UpdateStats(&mShaderStats[SHADER_VERTEX], pStats);
|
||||
}
|
||||
|
||||
virtual void Handle(const GSStats& event)
|
||||
{
|
||||
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
|
||||
UpdateStats(&mShaderStats[SHADER_GEOMETRY], pStats);
|
||||
}
|
||||
|
||||
virtual void Handle(const DSStats& event)
|
||||
{
|
||||
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
|
||||
UpdateStats(&mShaderStats[SHADER_DOMAIN], pStats);
|
||||
}
|
||||
|
||||
virtual void Handle(const HSStats& event)
|
||||
{
|
||||
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
|
||||
UpdateStats(&mShaderStats[SHADER_HULL], pStats);
|
||||
}
|
||||
|
||||
virtual void Handle(const PSStats& event)
|
||||
{
|
||||
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
|
||||
UpdateStats(&mShaderStats[SHADER_PIXEL], pStats);
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
virtual void Handle(const CSStats& event)
|
||||
{
|
||||
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
|
||||
UpdateStats(&mShaderStats[SHADER_COMPUTE], pStats);
|
||||
mNeedFlush = true;
|
||||
}
|
||||
|
||||
// Flush cached events for this draw
|
||||
virtual void FlushDraw(uint32_t drawId)
|
||||
{
|
||||
if (mNeedFlush == false)
|
||||
return;
|
||||
|
||||
EventHandlerFile::Handle(PSInfo(drawId,
|
||||
mShaderStats[SHADER_PIXEL].numInstExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numSampleExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numSampleLExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numSampleBExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numSampleCExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numSampleCLZExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numSampleCDExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numGather4Executed,
|
||||
mShaderStats[SHADER_PIXEL].numGather4CExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numGather4CPOExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numGather4CPOCExecuted,
|
||||
mShaderStats[SHADER_PIXEL].numLodExecuted));
|
||||
EventHandlerFile::Handle(CSInfo(drawId,
|
||||
mShaderStats[SHADER_COMPUTE].numInstExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numSampleExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numSampleLExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numSampleBExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numSampleCExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numSampleCLZExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numSampleCDExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numGather4Executed,
|
||||
mShaderStats[SHADER_COMPUTE].numGather4CExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numGather4CPOExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numGather4CPOCExecuted,
|
||||
mShaderStats[SHADER_COMPUTE].numLodExecuted));
|
||||
|
||||
// singleSample
|
||||
EventHandlerFile::Handle(EarlyZSingleSample(
|
||||
drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount));
|
||||
EventHandlerFile::Handle(LateZSingleSample(
|
||||
drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount));
|
||||
EventHandlerFile::Handle(
|
||||
EarlyStencilSingleSample(drawId,
|
||||
mDSSingleSample.earlyStencilTestPassCount,
|
||||
mDSSingleSample.earlyStencilTestFailCount));
|
||||
EventHandlerFile::Handle(
|
||||
LateStencilSingleSample(drawId,
|
||||
mDSSingleSample.lateStencilTestPassCount,
|
||||
mDSSingleSample.lateStencilTestFailCount));
|
||||
|
||||
// sampleRate
|
||||
EventHandlerFile::Handle(EarlyZSampleRate(
|
||||
drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount));
|
||||
EventHandlerFile::Handle(LateZSampleRate(
|
||||
drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount));
|
||||
EventHandlerFile::Handle(
|
||||
EarlyStencilSampleRate(drawId,
|
||||
mDSSampleRate.earlyStencilTestPassCount,
|
||||
mDSSampleRate.earlyStencilTestFailCount));
|
||||
EventHandlerFile::Handle(LateStencilSampleRate(drawId,
|
||||
mDSSampleRate.lateStencilTestPassCount,
|
||||
mDSSampleRate.lateStencilTestFailCount));
|
||||
|
||||
// combined
|
||||
EventHandlerFile::Handle(
|
||||
EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount));
|
||||
EventHandlerFile::Handle(
|
||||
LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount));
|
||||
EventHandlerFile::Handle(EarlyStencil(drawId,
|
||||
mDSCombined.earlyStencilTestPassCount,
|
||||
mDSCombined.earlyStencilTestFailCount));
|
||||
EventHandlerFile::Handle(LateStencil(drawId,
|
||||
mDSCombined.lateStencilTestPassCount,
|
||||
mDSCombined.lateStencilTestFailCount));
|
||||
|
||||
// pixelRate
|
||||
EventHandlerFile::Handle(EarlyZPixelRate(
|
||||
drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount));
|
||||
EventHandlerFile::Handle(LateZPixelRate(
|
||||
drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount));
|
||||
|
||||
|
||||
// NullPS
|
||||
EventHandlerFile::Handle(
|
||||
EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount));
|
||||
EventHandlerFile::Handle(EarlyStencilNullPS(
|
||||
drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount));
|
||||
|
||||
// Rasterized Subspans
|
||||
EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles));
|
||||
|
||||
// Alpha Subspans
|
||||
EventHandlerFile::Handle(
|
||||
AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
|
||||
|
||||
// Primitive Culling
|
||||
EventHandlerFile::Handle(
|
||||
CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
|
||||
|
||||
mDSSingleSample = {};
|
||||
mDSSampleRate = {};
|
||||
mDSCombined = {};
|
||||
mDSPixelRate = {};
|
||||
mDSNullPS = {};
|
||||
|
||||
rastStats = {};
|
||||
mCullStats = {};
|
||||
mAlphaStats = {};
|
||||
|
||||
mShaderStats[SHADER_PIXEL] = {};
|
||||
mShaderStats[SHADER_COMPUTE] = {};
|
||||
|
||||
mNeedFlush = false;
|
||||
}
|
||||
|
||||
virtual void Handle(const FrontendDrawEndEvent& event)
|
||||
{
|
||||
// Clipper
|
||||
EventHandlerFile::Handle(ClipperEvent(event.data.drawId,
|
||||
mClipper.trivialRejectCount,
|
||||
mClipper.trivialAcceptCount,
|
||||
mClipper.mustClipCount));
|
||||
|
||||
// Tesselator
|
||||
EventHandlerFile::Handle(TessPrims(event.data.drawId, mTS.inputPrims));
|
||||
|
||||
// Geometry Shader
|
||||
EventHandlerFile::Handle(GSInputPrims(event.data.drawId, mGS.inputPrimCount));
|
||||
EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, mGS.primGeneratedCount));
|
||||
EventHandlerFile::Handle(GSVertsInput(event.data.drawId, mGS.vertsInput));
|
||||
|
||||
EventHandlerFile::Handle(VSInfo(event.data.drawId,
|
||||
mShaderStats[SHADER_VERTEX].numInstExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numSampleExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numSampleLExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numSampleBExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numSampleCExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numSampleCLZExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numSampleCDExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numGather4Executed,
|
||||
mShaderStats[SHADER_VERTEX].numGather4CExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numGather4CPOExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numGather4CPOCExecuted,
|
||||
mShaderStats[SHADER_VERTEX].numLodExecuted));
|
||||
EventHandlerFile::Handle(HSInfo(event.data.drawId,
|
||||
mShaderStats[SHADER_HULL].numInstExecuted,
|
||||
mShaderStats[SHADER_HULL].numSampleExecuted,
|
||||
mShaderStats[SHADER_HULL].numSampleLExecuted,
|
||||
mShaderStats[SHADER_HULL].numSampleBExecuted,
|
||||
mShaderStats[SHADER_HULL].numSampleCExecuted,
|
||||
mShaderStats[SHADER_HULL].numSampleCLZExecuted,
|
||||
mShaderStats[SHADER_HULL].numSampleCDExecuted,
|
||||
mShaderStats[SHADER_HULL].numGather4Executed,
|
||||
mShaderStats[SHADER_HULL].numGather4CExecuted,
|
||||
mShaderStats[SHADER_HULL].numGather4CPOExecuted,
|
||||
mShaderStats[SHADER_HULL].numGather4CPOCExecuted,
|
||||
mShaderStats[SHADER_HULL].numLodExecuted));
|
||||
EventHandlerFile::Handle(DSInfo(event.data.drawId,
|
||||
mShaderStats[SHADER_DOMAIN].numInstExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numSampleExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numSampleLExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numSampleBExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numSampleCExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numSampleCLZExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numSampleCDExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numGather4Executed,
|
||||
mShaderStats[SHADER_DOMAIN].numGather4CExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numGather4CPOExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numGather4CPOCExecuted,
|
||||
mShaderStats[SHADER_DOMAIN].numLodExecuted));
|
||||
EventHandlerFile::Handle(GSInfo(event.data.drawId,
|
||||
mShaderStats[SHADER_GEOMETRY].numInstExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numSampleExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numSampleLExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numSampleBExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numSampleCExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numSampleCLZExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numSampleCDExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numGather4Executed,
|
||||
mShaderStats[SHADER_GEOMETRY].numGather4CExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numGather4CPOExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numGather4CPOCExecuted,
|
||||
mShaderStats[SHADER_GEOMETRY].numLodExecuted));
|
||||
|
||||
mShaderStats[SHADER_VERTEX] = {};
|
||||
mShaderStats[SHADER_HULL] = {};
|
||||
mShaderStats[SHADER_DOMAIN] = {};
|
||||
mShaderStats[SHADER_GEOMETRY] = {};
|
||||
|
||||
// Reset Internal Counters
|
||||
mClipper = {};
|
||||
mTS = {};
|
||||
mGS = {};
|
||||
}
|
||||
|
||||
virtual void Handle(const GSPrimInfo& event)
|
||||
{
|
||||
mGS.inputPrimCount += event.data.inputPrimCount;
|
||||
mGS.primGeneratedCount += event.data.primGeneratedCount;
|
||||
mGS.vertsInput += event.data.vertsInput;
|
||||
}
|
||||
|
||||
/// Adds the event's primitive count to the per-draw tessellation input total.
virtual void Handle(const TessPrimCount& event)
{
    mTS.inputPrims += event.data.primCount;
}
|
||||
|
||||
virtual void Handle(const RasterTileCount& event)
|
||||
{
|
||||
rastStats.rasterTiles += event.data.rasterTiles;
|
||||
}
|
||||
|
||||
virtual void Handle(const CullInfoEvent& event)
|
||||
{
|
||||
mCullStats.degeneratePrimCount += _mm_popcnt_u32(
|
||||
event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask));
|
||||
mCullStats.backfacePrimCount += _mm_popcnt_u32(
|
||||
event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask));
|
||||
}
|
||||
|
||||
virtual void Handle(const AlphaInfoEvent& event)
|
||||
{
|
||||
mAlphaStats.alphaTestCount += event.data.alphaTestEnable;
|
||||
mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable;
|
||||
}
|
||||
|
||||
protected:
|
||||
bool mNeedFlush;
|
||||
// Per draw stats
|
||||
DepthStencilStats mDSSingleSample = {};
|
||||
DepthStencilStats mDSSampleRate = {};
|
||||
DepthStencilStats mDSPixelRate = {};
|
||||
DepthStencilStats mDSCombined = {};
|
||||
DepthStencilStats mDSNullPS = {};
|
||||
DepthStencilStats mDSOmZ = {};
|
||||
CStats mClipper = {};
|
||||
TEStats mTS = {};
|
||||
GSStateInfo mGS = {};
|
||||
RastStats rastStats = {};
|
||||
CullStats mCullStats = {};
|
||||
AlphaStats mAlphaStats = {};
|
||||
|
||||
SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
|
||||
|
||||
};
|
||||
|
||||
/// Converts an opaque thread-context handle back to the EventManager pointer
/// it was created from. No validation is performed.
static EventManager* FromHandle(HANDLE hThreadContext)
{
    EventManager* const pManager = reinterpret_cast<EventManager*>(hThreadContext);
    return pManager;
}
|
||||
|
||||
// Construct an event manager and associate a handler with it.
|
||||
HANDLE CreateThreadContext(AR_THREAD type)
|
||||
{
|
||||
// Can we assume single threaded here?
|
||||
static std::atomic<uint32_t> counter(0);
|
||||
uint32_t id = counter.fetch_add(1);
|
||||
|
||||
EventManager* pManager = new EventManager();
|
||||
|
||||
if (pManager)
|
||||
{
|
||||
EventHandlerFile* pHandler = nullptr;
|
||||
|
||||
if (type == AR_THREAD::API)
|
||||
{
|
||||
pHandler = new EventHandlerApiStats(id);
|
||||
pManager->Attach(pHandler);
|
||||
pHandler->Handle(ThreadStartApiEvent());
|
||||
}
|
||||
else
|
||||
{
|
||||
pHandler = new EventHandlerWorkerStats(id);
|
||||
pManager->Attach(pHandler);
|
||||
pHandler->Handle(ThreadStartWorkerEvent());
|
||||
}
|
||||
|
||||
pHandler->MarkHeader();
|
||||
|
||||
return pManager;
|
||||
}
|
||||
|
||||
SWR_INVALID("Failed to register thread.");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Destroys the event manager behind the given thread-context handle.
/// The handle is asserted to be non-null.
void DestroyThreadContext(HANDLE hThreadContext)
{
    EventManager* const pEventManager = FromHandle(hThreadContext);
    SWR_ASSERT(pEventManager != nullptr);

    delete pEventManager;
}
|
||||
|
||||
// Dispatch event for this thread.
|
||||
void Dispatch(HANDLE hThreadContext, const Event& event)
|
||||
{
|
||||
if (event.IsEnabled())
|
||||
{
|
||||
EventManager* pManager = reinterpret_cast<EventManager*>(hThreadContext);
|
||||
SWR_ASSERT(pManager != nullptr);
|
||||
pManager->Dispatch(event);
|
||||
}
|
||||
}
|
||||
|
||||
// Flush for this thread.
|
||||
void FlushDraw(HANDLE hThreadContext, uint32_t drawId)
|
||||
{
|
||||
EventManager* pManager = FromHandle(hThreadContext);
|
||||
SWR_ASSERT(pManager != nullptr);
|
||||
|
||||
pManager->FlushDraw(drawId);
|
||||
}
|
||||
} // namespace ArchRast
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file archrast.h
|
||||
*
|
||||
* @brief Definitions for archrast.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
#include "gen_ar_event.hpp"
|
||||
#include "eventmanager.h"
|
||||
|
||||
namespace ArchRast
|
||||
{
|
||||
enum class AR_THREAD
|
||||
{
|
||||
API = 0,
|
||||
WORKER = 1
|
||||
};
|
||||
|
||||
HANDLE CreateThreadContext(AR_THREAD type);
|
||||
void DestroyThreadContext(HANDLE hThreadContext);
|
||||
|
||||
// Dispatch event for this thread.
|
||||
void Dispatch(HANDLE hThreadContext, const Event& event);
|
||||
|
||||
void FlushDraw(HANDLE hThreadContext, uint32_t drawId);
|
||||
}; // namespace ArchRast
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file eventmanager.h
|
||||
*
|
||||
* @brief Definitions for the event manager.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
|
||||
#include "gen_ar_event.hpp"
|
||||
#include "gen_ar_eventhandler.hpp"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace ArchRast
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// EventManager - interface to dispatch events to handlers.
|
||||
/// Event handling occurs only on a single thread.
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
class EventManager
|
||||
{
|
||||
public:
|
||||
EventManager() {}
|
||||
|
||||
~EventManager()
|
||||
{
|
||||
// Event manager owns destroying handler objects once attached.
|
||||
///@note See comment for Detach.
|
||||
for (auto pHandler : mHandlers)
|
||||
{
|
||||
delete pHandler;
|
||||
}
|
||||
}
|
||||
|
||||
void Attach(EventHandler* pHandler)
|
||||
{
|
||||
SWR_ASSERT(pHandler != nullptr);
|
||||
mHandlers.push_back(pHandler);
|
||||
}
|
||||
|
||||
void Dispatch(const Event& event)
|
||||
{
|
||||
///@todo Add event filter check here.
|
||||
|
||||
for (auto pHandler : mHandlers)
|
||||
{
|
||||
event.Accept(pHandler);
|
||||
}
|
||||
}
|
||||
|
||||
void FlushDraw(uint32_t drawId)
|
||||
{
|
||||
for (auto pHandler : mHandlers)
|
||||
{
|
||||
pHandler->FlushDraw(drawId);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Handlers stay registered for life
|
||||
void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); }
|
||||
|
||||
std::vector<EventHandler*> mHandlers;
|
||||
};
|
||||
}; // namespace ArchRast
|
||||
|
|
@ -1,427 +0,0 @@
|
|||
# Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
#
|
||||
# Provides definitions for events.
|
||||
|
||||
// Kind of draw call recorded in DrawInfoEvent::type.
enum AR_DRAW_TYPE
{
    Instanced             = 0,
    IndexedInstanced      = 1,
    InstancedSplit        = 2,
    IndexedInstancedSplit = 3
};
|
||||
|
||||
///@brief Emitted when a thread context is created for an API thread. Carries no data.
event Framework::ThreadStartApiEvent
{
};
|
||||
|
||||
///@brief Emitted when a thread context is created for a worker thread. Carries no data.
event Framework::ThreadStartWorkerEvent
{
};
|
||||
|
||||
///@brief Helper event that indicates end of frame. Capturing end of frame is not guaranteed on all APIs.
event ApiSwr::FrameEndEvent
{
    uint32_t frameId;    // current frame id
    uint32_t nextDrawId; // next draw id (always incremental - does not reset)
};
|
||||
|
||||
///@brief Synchronization event, tagged with a draw id.
event ApiSwr::SwrSyncEvent
{
    uint32_t drawId;
};
|
||||
|
||||
///@brief Invalidate hot tiles (i.e. the tile cache).
event ApiSwr::SwrInvalidateTilesEvent
{
    uint32_t drawId;
};
|
||||
|
||||
///@brief Invalidate and discard the hot tiles that lie within a pixel region.
event ApiSwr::SwrDiscardRectEvent
{
    uint32_t drawId;
};
|
||||
|
||||
///@brief Flush tiles out to memory that is typically owned by the driver (e.g. flush the RT cache).
event ApiSwr::SwrStoreTilesEvent
{
    uint32_t drawId;
};
|
||||
|
||||
///@brief Parameters of one draw call.
event PipelineStats::DrawInfoEvent
{
    uint32_t     drawId;
    AR_DRAW_TYPE type;          // type of draw (indexed, instanced, etc)
    uint32_t     topology;      // topology of draw
    uint32_t     numVertices;   // number of vertices for draw
    uint32_t     numIndices;    // number of indices for draw
    int32_t      indexOffset;   // offset into index buffer
    int32_t      baseVertex;    // which vertex to start with
    uint32_t     numInstances;  // number of instances to draw
    uint32_t     startInstance; // which instance to start fetching
    uint32_t     tsEnable;      // tessellation enabled
    uint32_t     gsEnable;      // geometry shader enabled
    uint32_t     soEnable;      // stream-out enabled
    uint32_t     soTopology;    // topology of stream-out
    uint32_t     splitId;       // split draw count or id
};
|
||||
|
||||
///@brief Thread-group counts of one dispatch.
event PipelineStats::DispatchEvent
{
    uint32_t drawId;
    uint32_t threadGroupCountX; // num thread groups in X dimension
    uint32_t threadGroupCountY; // num thread groups in Y dimension
    uint32_t threadGroupCountZ; // num thread groups in Z dimension
};
|
||||
|
||||
///@brief Front-end pipeline counters reported for a draw id.
event PipelineStats::FrontendStatsEvent
{
    uint32_t drawId;
    uint64_t IaVertices;
    uint64_t IaPrimitives;
    uint64_t VsInvocations;
    uint64_t HsInvocations;
    uint64_t DsInvocations;
    uint64_t GsInvocations;
    uint64_t GsPrimitives;
    uint64_t CInvocations;
    uint64_t CPrimitives;
    uint64_t SoPrimStorageNeeded0;
    uint64_t SoPrimStorageNeeded1;
    uint64_t SoPrimStorageNeeded2;
    uint64_t SoPrimStorageNeeded3;
    uint64_t SoNumPrimsWritten0;
    uint64_t SoNumPrimsWritten1;
    uint64_t SoNumPrimsWritten2;
    uint64_t SoNumPrimsWritten3;
};
|
||||
|
||||
///@brief Back-end pipeline counters reported for a draw id.
event PipelineStats::BackendStatsEvent
{
    uint32_t drawId;
    uint64_t DepthPassCount;
    uint64_t PsInvocations;
    uint64_t CsInvocations;
};
|
||||
|
||||
///@brief Per-draw early-Z pass/fail totals for single-sample events.
event PipelineStats::EarlyZSingleSample
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw late-Z pass/fail totals for single-sample events.
event PipelineStats::LateZSingleSample
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw early-stencil pass/fail totals for single-sample events.
event PipelineStats::EarlyStencilSingleSample
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw late-stencil pass/fail totals for single-sample events.
event PipelineStats::LateStencilSingleSample
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw early-Z pass/fail totals for sample-rate events.
event PipelineStats::EarlyZSampleRate
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw late-Z pass/fail totals for sample-rate events.
event PipelineStats::LateZSampleRate
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw early-stencil pass/fail totals for sample-rate events.
event PipelineStats::EarlyStencilSampleRate
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
///@brief Per-draw late-stencil pass/fail totals for sample-rate events.
event PipelineStats::LateStencilSampleRate
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
// Total Early-Z counts: SingleSample and SampleRate combined
event PipelineStats::EarlyZ
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
// Total Late-Z counts: SingleSample and SampleRate combined
event PipelineStats::LateZ
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
// Total Early-Stencil counts: SingleSample and SampleRate combined
event PipelineStats::EarlyStencil
{
    uint32_t drawId;
    uint64_t passCount;
    uint64_t failCount;
};
|
||||
|
||||
// Total LateStencil counts, SingleSample and SampleRate
|
||||
event PipelineStats::LateStencil
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyZNullPS
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyStencilNullPS
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyZPixelRate
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::LateZPixelRate
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
|
||||
event PipelineStats::EarlyOmZ
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyOmStencil
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::LateOmZ
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::LateOmStencil
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t passCount;
|
||||
uint64_t failCount;
|
||||
};
|
||||
|
||||
event PipelineStats::GSInputPrims
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t inputPrimCount;
|
||||
};
|
||||
|
||||
event PipelineStats::GSPrimsGen
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t primGeneratedCount;
|
||||
};
|
||||
|
||||
event PipelineStats::GSVertsInput
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t vertsInput;
|
||||
};
|
||||
|
||||
event PipelineStats::TessPrims
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t primCount;
|
||||
};
|
||||
|
||||
event PipelineStats::RasterTiles
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t rastTileCount;
|
||||
};
|
||||
|
||||
event PipelineStats::ClipperEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t trivialRejectCount;
|
||||
uint32_t trivialAcceptCount;
|
||||
uint32_t mustClipCount;
|
||||
};
|
||||
|
||||
event PipelineStats::CullEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t backfacePrimCount;
|
||||
uint64_t degeneratePrimCount;
|
||||
};
|
||||
|
||||
event PipelineStats::AlphaEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t alphaTestCount;
|
||||
uint32_t alphaBlendCount;
|
||||
};
|
||||
|
||||
event ShaderStats::VSInfo
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t numInstExecuted;
|
||||
uint32_t numSampleExecuted;
|
||||
uint32_t numSampleLExecuted;
|
||||
uint32_t numSampleBExecuted;
|
||||
uint32_t numSampleCExecuted;
|
||||
uint32_t numSampleCLZExecuted;
|
||||
uint32_t numSampleCDExecuted;
|
||||
uint32_t numGather4Executed;
|
||||
uint32_t numGather4CExecuted;
|
||||
uint32_t numGather4CPOExecuted;
|
||||
uint32_t numGather4CPOCExecuted;
|
||||
uint32_t numLodExecuted;
|
||||
};
|
||||
|
||||
event ShaderStats::HSInfo
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t numInstExecuted;
|
||||
uint32_t numSampleExecuted;
|
||||
uint32_t numSampleLExecuted;
|
||||
uint32_t numSampleBExecuted;
|
||||
uint32_t numSampleCExecuted;
|
||||
uint32_t numSampleCLZExecuted;
|
||||
uint32_t numSampleCDExecuted;
|
||||
uint32_t numGather4Executed;
|
||||
uint32_t numGather4CExecuted;
|
||||
uint32_t numGather4CPOExecuted;
|
||||
uint32_t numGather4CPOCExecuted;
|
||||
uint32_t numLodExecuted;
|
||||
};
|
||||
|
||||
event ShaderStats::DSInfo
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t numInstExecuted;
|
||||
uint32_t numSampleExecuted;
|
||||
uint32_t numSampleLExecuted;
|
||||
uint32_t numSampleBExecuted;
|
||||
uint32_t numSampleCExecuted;
|
||||
uint32_t numSampleCLZExecuted;
|
||||
uint32_t numSampleCDExecuted;
|
||||
uint32_t numGather4Executed;
|
||||
uint32_t numGather4CExecuted;
|
||||
uint32_t numGather4CPOExecuted;
|
||||
uint32_t numGather4CPOCExecuted;
|
||||
uint32_t numLodExecuted;
|
||||
};
|
||||
|
||||
event ShaderStats::GSInfo
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t numInstExecuted;
|
||||
uint32_t numSampleExecuted;
|
||||
uint32_t numSampleLExecuted;
|
||||
uint32_t numSampleBExecuted;
|
||||
uint32_t numSampleCExecuted;
|
||||
uint32_t numSampleCLZExecuted;
|
||||
uint32_t numSampleCDExecuted;
|
||||
uint32_t numGather4Executed;
|
||||
uint32_t numGather4CExecuted;
|
||||
uint32_t numGather4CPOExecuted;
|
||||
uint32_t numGather4CPOCExecuted;
|
||||
uint32_t numLodExecuted;
|
||||
|
||||
};
|
||||
|
||||
event ShaderStats::PSInfo
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t numInstExecuted;
|
||||
uint32_t numSampleExecuted;
|
||||
uint32_t numSampleLExecuted;
|
||||
uint32_t numSampleBExecuted;
|
||||
uint32_t numSampleCExecuted;
|
||||
uint32_t numSampleCLZExecuted;
|
||||
uint32_t numSampleCDExecuted;
|
||||
uint32_t numGather4Executed;
|
||||
uint32_t numGather4CExecuted;
|
||||
uint32_t numGather4CPOExecuted;
|
||||
uint32_t numGather4CPOCExecuted;
|
||||
uint32_t numLodExecuted;
|
||||
};
|
||||
|
||||
event ShaderStats::CSInfo
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t numInstExecuted;
|
||||
uint32_t numSampleExecuted;
|
||||
uint32_t numSampleLExecuted;
|
||||
uint32_t numSampleBExecuted;
|
||||
uint32_t numSampleCExecuted;
|
||||
uint32_t numSampleCLZExecuted;
|
||||
uint32_t numSampleCDExecuted;
|
||||
uint32_t numGather4Executed;
|
||||
uint32_t numGather4CExecuted;
|
||||
uint32_t numGather4CPOExecuted;
|
||||
uint32_t numGather4CPOCExecuted;
|
||||
uint32_t numLodExecuted;
|
||||
};
|
||||
|
||||
|
|
@ -1,212 +0,0 @@
|
|||
# Copyright (C) 2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
#
|
||||
# Provides definitions for private internal events that are only used internally
|
||||
# to Rasty for communicating information between Rasty and ArchRast. One goal for
|
||||
# ArchRast is to not pollute the Rasty code with lots of calculations, etc. that
|
||||
# are needed to compute per draw statistics, etc.
|
||||
|
||||
event PipelineStats::EarlyDepthStencilInfoSingleSample
|
||||
{
|
||||
uint64_t depthPassMask;
|
||||
uint64_t stencilPassMask;
|
||||
uint64_t coverageMask;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyDepthStencilInfoSampleRate
|
||||
{
|
||||
uint64_t depthPassMask;
|
||||
uint64_t stencilPassMask;
|
||||
uint64_t coverageMask;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyDepthStencilInfoNullPS
|
||||
{
|
||||
uint64_t depthPassMask;
|
||||
uint64_t stencilPassMask;
|
||||
uint64_t coverageMask;
|
||||
};
|
||||
|
||||
event PipelineStats::LateDepthStencilInfoSingleSample
|
||||
{
|
||||
uint64_t depthPassMask;
|
||||
uint64_t stencilPassMask;
|
||||
uint64_t coverageMask;
|
||||
};
|
||||
|
||||
event PipelineStats::LateDepthStencilInfoSampleRate
|
||||
{
|
||||
uint64_t depthPassMask;
|
||||
uint64_t stencilPassMask;
|
||||
uint64_t coverageMask;
|
||||
};
|
||||
|
||||
event PipelineStats::LateDepthStencilInfoNullPS
|
||||
{
|
||||
uint64_t depthPassMask;
|
||||
uint64_t stencilPassMask;
|
||||
uint64_t coverageMask;
|
||||
};
|
||||
|
||||
event PipelineStats::EarlyDepthInfoPixelRate
|
||||
{
|
||||
uint64_t depthPassCount;
|
||||
uint64_t activeLanes;
|
||||
};
|
||||
|
||||
|
||||
event PipelineStats::LateDepthInfoPixelRate
|
||||
{
|
||||
uint64_t depthPassCount;
|
||||
uint64_t activeLanes;
|
||||
};
|
||||
|
||||
|
||||
event PipelineStats::BackendDrawEndEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
};
|
||||
|
||||
event PipelineStats::FrontendDrawEndEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
};
|
||||
|
||||
event Memory::MemoryAccessEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t tsc;
|
||||
uint64_t ptr;
|
||||
uint32_t size;
|
||||
uint8_t isRead;
|
||||
uint8_t client;
|
||||
};
|
||||
|
||||
event Memory::MemoryStatsEndEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
};
|
||||
|
||||
event PipelineStats::TessPrimCount
|
||||
{
|
||||
uint64_t primCount;
|
||||
};
|
||||
|
||||
event PipelineStats::RasterTileCount
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t rasterTiles;
|
||||
};
|
||||
|
||||
event PipelineStats::GSPrimInfo
|
||||
{
|
||||
uint64_t inputPrimCount;
|
||||
uint64_t primGeneratedCount;
|
||||
uint64_t vertsInput;
|
||||
};
|
||||
|
||||
// validMask marks primitives that are still live at the clipper, i.e. they weren't rejected by trivial reject or NaN.
|
||||
// clipMask marks primitives that need to be clipped, so a trivially accepted primitive has clipMask bit 0 while its validMask bit is 1.
|
||||
// Trivial reject is numInvocations - pop_cnt32(validMask)
|
||||
// Trivial accept is pop_cnt32(validMask & ~clipMask)
|
||||
// Must clip count is pop_cnt32(clipMask)
|
||||
event PipelineStats::ClipInfoEvent
|
||||
{
|
||||
uint32_t numInvocations;
|
||||
uint32_t validMask;
|
||||
uint32_t clipMask;
|
||||
};
|
||||
|
||||
event PipelineStats::CullInfoEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint64_t degeneratePrimMask;
|
||||
uint64_t backfacePrimMask;
|
||||
uint32_t validMask;
|
||||
};
|
||||
|
||||
event PipelineStats::AlphaInfoEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t alphaTestEnable;
|
||||
uint32_t alphaBlendEnable;
|
||||
};
|
||||
|
||||
event PipelineStats::DrawInstancedEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t topology;
|
||||
uint32_t numVertices;
|
||||
int32_t startVertex;
|
||||
uint32_t numInstances;
|
||||
uint32_t startInstance;
|
||||
uint32_t tsEnable;
|
||||
uint32_t gsEnable;
|
||||
uint32_t soEnable;
|
||||
uint32_t soTopology;
|
||||
uint32_t splitId; // Split draw count or id.
|
||||
};
|
||||
|
||||
event PipelineStats::DrawIndexedInstancedEvent
|
||||
{
|
||||
uint32_t drawId;
|
||||
uint32_t topology;
|
||||
uint32_t numIndices;
|
||||
int32_t indexOffset;
|
||||
int32_t baseVertex;
|
||||
uint32_t numInstances;
|
||||
uint32_t startInstance;
|
||||
uint32_t tsEnable;
|
||||
uint32_t gsEnable;
|
||||
uint32_t soEnable;
|
||||
uint32_t soTopology;
|
||||
uint32_t splitId; // Split draw count or id.
|
||||
};
|
||||
|
||||
event ShaderStats::VSStats
|
||||
{
|
||||
HANDLE hStats; // SWR_SHADER_STATS
|
||||
};
|
||||
|
||||
event ShaderStats::HSStats
|
||||
{
|
||||
HANDLE hStats; // SWR_SHADER_STATS
|
||||
};
|
||||
|
||||
event ShaderStats::DSStats
|
||||
{
|
||||
HANDLE hStats; // SWR_SHADER_STATS
|
||||
};
|
||||
|
||||
event ShaderStats::GSStats
|
||||
{
|
||||
HANDLE hStats; // SWR_SHADER_STATS
|
||||
};
|
||||
|
||||
event ShaderStats::PSStats
|
||||
{
|
||||
HANDLE hStats; // SWR_SHADER_STATS
|
||||
};
|
||||
|
||||
event ShaderStats::CSStats
|
||||
{
|
||||
HANDLE hStats; // SWR_SHADER_STATS
|
||||
};
|
||||
|
|
@ -1,327 +0,0 @@
|
|||
# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
# Python source
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
from gen_common import *
|
||||
|
||||
def parse_event_fields(lines, idx, event_dict):
    """
        Reads the field declarations of one event body out of a proto file.

        Scanning begins at lines[idx] and ends after the first line that starts
        with '};' (or when lines is exhausted). Each declaration found is stored
        as a dict with 'type', 'name', 'size' (array length, 1 for scalars) and
        'desc' (text of a trailing // comment). The list of fields and its
        length are written to event_dict['fields'] / event_dict['num_fields'].

        Returns the index of the first line that was not consumed.
    """
    # ex 1: uint32_t    numSampleCLZExecuted; // number of sample_cl_z instructions executed
    # ex 2: char        reason[256]; // size of reason
    # groups: 1 indent, 2 type, 3 gap, 4 name, 5 [array size], 6 //comment
    field_re = re.compile(r'^(\s*)([\w\*]+)(\s+)([\w]+)(\[\d+\])*;\s*(\/\/.*)*$')
    close_re = re.compile(r'(\s*)};')

    collected = []
    total = len(lines)

    # note: the opening brace is not validated, it simply fails to match a field.
    while idx < total:
        text = lines[idx].rstrip()
        idx += 1

        found = field_re.match(text)
        if found is not None:
            _, ftype, _, fname, fdim, fnote = found.groups()
            collected.append({
                "type": ftype,
                "name": fname,
                "size": int(fdim[1:-1]) if fdim else 1,
                "desc": fnote[2:].strip() if fnote else "",
            })

        if close_re.match(text):
            break

    event_dict['fields'] = collected
    event_dict['num_fields'] = len(collected)

    return idx
|
||||
|
||||
def parse_enums(lines, idx, event_dict):
    """
        Reads the body of one enum definition out of a proto file.

        Scanning begins at lines[idx] and ends after the first line that starts
        with '};' (or when lines is exhausted). Every line that begins with an
        identifier is kept verbatim (right-stripped) in event_dict['names'];
        lines containing '#if' or '#endif' are skipped.

        Returns the index of the first line that was not consumed.
    """
    kept = []
    total = len(lines)

    # note: the opening brace is not validated, it simply fails to match a name.
    while idx < total:
        text = lines[idx].rstrip()
        idx += 1

        # preprocessor conditionals are not enum values
        if re.search(r'#if|#endif', text):
            continue

        if re.match(r'(\s*)(\w+)(\s*)', text):
            kept.append(text)

        if re.match(r'(\s*)};', text):
            break

    event_dict['names'] = kept
    return idx
|
||||
|
||||
def parse_protos(files, verbose=False):
    """
        Parses one or more proto files and returns a dictionary of event and
        enum definitions.

        files   - a single path, or a list/tuple of paths, to proto files
        verbose - when True, prints the name of each file as it is parsed

        Returned structure:

        {
            "events": {
                "defs": {       // event definitions keyed by 'group_name::event_name'
                    "ApiStat::DrawInfoEvent": {
                        "id": 3,
                        "group": "ApiStat",
                        "name": "DrawInfoEvent",    // name without 'group_name::' prefix
                        "desc": "",
                        "fields": [ { "type", "name", "size", "desc" }, ... ],
                        "num_fields": ...
                    },
                    ...
                },
                "groups": {     // sorted lists of event keys, per group
                    "ApiStat": [ "ApiStat::DispatchEvent", ... ],
                    ...
                },
                "map": {        // integer event id -> event key
                    1: "Framework::ThreadStartApiEvent",
                    ...
                }
            },
            "enums": {
                "defs": { "EnumName": { "name", "desc", "names" }, ... },
                "map":  { 1: "EnumName", ... }   // integer enum id -> enum name
            }
        }

        Event and enum ids start at 1 and keep counting across all files.
    """

    protos = {
        'events': {
            'defs': {},             # event dictionary containing events with their fields
            'map': {},              # dictionary to map event ids to event names
            'groups': {}            # event keys stored by groups
        },
        'enums': {
            'defs': {},
            'map': {}
        }
    }

    event_id = 0
    enum_id = 0

    # Accept a lone path as well as any list/tuple of paths.
    if not isinstance(files, (list, tuple)):
        files = [files]

    for filename in files:
        if verbose:
            print("Parsing proto file: %s" % os.path.normpath(filename))

        with open(filename, 'r') as f:
            lines = f.readlines()

        in_brief = False
        brief = []
        idx = 0
        while idx < len(lines):
            line = lines[idx].strip()
            idx += 1

            # If currently processing a brief, keep processing or change state
            if in_brief:
                match = re.match(r'^\s*\/\/\/\s*(.*)$', line)                   # i.e. "/// more event desc..."
                if match:
                    brief.append(match.group(1).strip())
                    continue
                in_brief = False

            # Match event/enum brief
            match = re.match(r'^\s*\/\/\/\s*@(brief|breif)\s*(.*)$', line)       # i.e. "///@brief My event desc..."
            if match:
                in_brief = True
                brief.append(match.group(2).strip())
                continue

            # Match event definition
            match = re.match(r'event(\s*)(((\w*)::){0,1}(\w+))', line)          # i.e. "event SWTag::CounterEvent"
            if match:
                event_id += 1

                event_key = match.group(2)                                      # i.e. SWTag::CounterEvent
                event = {
                    'id': event_id,
                    'group': match.group(4) if match.group(4) else "",          # i.e. SWTag
                    'name': match.group(5),                                     # i.e. CounterEvent
                    'desc': ' '.join(brief)
                }
                # Add period at end of event desc if necessary
                if event["desc"] and event["desc"][-1] != '.':
                    event["desc"] += '.'

                # The brief has been consumed by this event
                brief = []

                # Now add event fields; this advances idx past the closing '};'
                idx = parse_event_fields(lines, idx, event)

                # Register event and mapping
                protos['events']['defs'][event_key] = event
                protos['events']['map'][event_id] = event_key

                continue

            # Match enum definition
            match = re.match(r'enum(\s*)(\w+)', line)
            if match:
                enum_id += 1

                enum_name = match.group(2)
                enum = {
                    'name': enum_name,
                    'desc': ' '.join(brief)
                }
                # Add period at end of enum desc if necessary
                if enum["desc"] and enum["desc"][-1] != '.':
                    enum["desc"] += '.'

                # The brief has been consumed by this enum
                brief = []

                # Now add enum values; this advances idx past the closing '};'
                idx = parse_enums(lines, idx, enum)

                # Register enum and mapping
                protos['enums']['defs'][enum_name] = enum
                protos['enums']['map'][enum_id] = enum_name

                continue

    # Sort and group events
    event_groups = protos['events']['groups']
    for key in sorted(protos['events']['defs']):
        group = protos['events']['defs'][key]['group']
        event_groups.setdefault(group, []).append(key)

    return protos
|
||||
|
||||
|
||||
def main():
    """
        Command-line entry point: parses the proto files given with --proto and
        renders the ArchRast event sources from the Mako templates found in the
        'templates' directory next to this script.

        Returns 0 on success and 1 on failure (output directory cannot be
        created, a proto file is missing, or generation raised an exception).
    """

    # Parse args...
    parser = ArgumentParser()
    parser.add_argument("--proto", "-p", dest="protos", nargs='+', help="Path to all proto file(s) to process. Accepts one or more paths (i.e. events.proto and events_private.proto)", required=True)
    parser.add_argument("--output-dir", help="Output dir (defaults to ./codegen). Will create folder if it does not exist.", required=False, default="codegen")
    parser.add_argument("--verbose", "-v", help="Verbose", action="store_true")
    args = parser.parse_args()

    # MakeDir succeeds for an already existing directory; a non-zero result
    # means the output directory is unusable, so stop here instead of failing
    # later while writing the first file.
    if MakeDir(args.output_dir) != 0:
        print('Error: Could not create output directory %s' % args.output_dir, file=sys.stderr)
        return 1

    for f in args.protos:
        if not os.path.exists(f):
            print('Error: Could not find proto file %s' % f, file=sys.stderr)
            return 1

    # Parse each proto file and add to protos container
    protos = parse_protos(args.protos, args.verbose)

    # [output/template file name, event header passed to the template]
    files = [
        ["gen_ar_event.hpp", ""],
        ["gen_ar_event.cpp", ""],
        ["gen_ar_eventhandler.hpp", "gen_ar_event.hpp"],
        ["gen_ar_eventhandlerfile.hpp", "gen_ar_eventhandler.hpp"]
    ]

    rval = 0

    try:
        # Delete existing files
        for f in files:
            filename = f[0]
            output_fullpath = os.path.join(args.output_dir, filename)
            if os.path.exists(output_fullpath):
                if args.verbose:
                    print("Deleting existing file: %s" % output_fullpath)
                os.remove(output_fullpath)

        # Generate files from templates
        print("Generating c++ from proto files...")
        template_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
        for f in files:
            filename = f[0]
            event_header = f[1]
            template_file = os.path.join(template_dir, filename)
            output_fullpath = os.path.join(args.output_dir, filename)

            if args.verbose:
                print("Generating: %s" % output_fullpath)
            MakoTemplateWriter.to_file(template_file, output_fullpath,
                    cmdline=sys.argv,
                    filename=filename,
                    protos=protos,
                    event_header=event_header)

    except Exception as e:
        # Report on stderr like the other error paths of this script.
        print(e, file=sys.stderr)
        rval = 1

    return rval
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
|
@ -1,164 +0,0 @@
|
|||
# Copyright (C) 2017-2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the 'Software'),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
# Python source
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import sys
|
||||
from gen_common import *
|
||||
|
||||
|
||||
def main(args=sys.argv[1:]):
    '''
        Generates the source files that initialize every entry of the backend
        (or, with --rast, rasterizer) function table.

        One line of C++ is produced per combination of table indices given by
        --dim; the lines are spread over one or more .cpp files (--split /
        --numfiles). The files are rendered into a temporary directory and then
        copied into --outdir with CopyDirFilesIfDifferent.

        returns 0 on success, non-zero on failure
    '''
    thisDir = os.path.dirname(os.path.realpath(__file__))
    parser = ArgumentParser('Generate files and initialization functions for all permutations of BackendPixelRate.')
    parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
    parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
    parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default=512)
    parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default=0)
    parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
    parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
    parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
    parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False)

    args = parser.parse_args(args)

    class backendStrs:
        # File names, table names and template names for the selected target;
        # the rasterizer values replace the backend ones when --rast is given.
        def __init__(self):
            self.outFileName = 'gen_BackendPixelRate%s.cpp'
            self.outHeaderName = 'gen_BackendPixelRate.hpp'
            self.functionTableName = 'gBackendPixelRateTable'
            self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
            self.template = 'gen_backend.cpp'
            self.hpp_template = 'gen_header_init.hpp'
            self.cmakeFileName = 'gen_backends.cmake'
            self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
            self.tableName = 'BackendPixelRate'

            if args.rast:
                self.outFileName = 'gen_rasterizer%s.cpp'
                self.outHeaderName = 'gen_rasterizer.hpp'
                self.functionTableName = 'gRasterizerFuncs'
                self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<'
                self.template = 'gen_rasterizer.cpp'
                self.cmakeFileName = 'gen_rasterizer.cmake'
                self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES'
                self.tableName = 'RasterizerFuncs'

    backend = backendStrs()

    # one index range per table dimension
    index_ranges = [list(range(x)) for x in args.dim]

    # for each permutation of template parameter inputs, build one line of C++:
    #   <table>[i][j]... = <Func><<Traits><i,j,...>>;
    output_list = []
    for combo in itertools.product(*index_ranges):
        tempStr = backend.functionTableName
        # each template param is also an index in the multidimensional array
        for i in combo:
            tempStr += '[' + str(i) + ']'
        tempStr += backend.funcInstanceHeader + ','.join(map(str, combo)) + '>>;'
        output_list.append(tempStr)

    # how many files should we split the global template initialization into?
    if (args.split == 0):
        numFiles = 1
    else:
        numFiles = (len(output_list) + args.split - 1) // args.split
    if (args.numfiles != 0):
        numFiles = args.numfiles
    linesPerFile = (len(output_list) + numFiles - 1) // numFiles
    chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
    # --numfiles may request more files than there are chunks. numFiles is still
    # passed to the header and cmake templates, so give the surplus files an
    # empty function list instead of indexing past the end of chunkedList.
    chunkedList += [[] for _ in range(numFiles - len(chunkedList))]

    # Create the destination before the temp dir so that bailing out here
    # leaves nothing behind to clean up.
    if not os.path.exists(args.outdir):
        try:
            os.makedirs(args.outdir)
        except OSError as err:
            if err.errno != errno.EEXIST:
                print('ERROR: Could not create directory:', args.outdir, file=sys.stderr)
                return 1

    tmp_output_dir = MakeTmpDir('_codegen')

    rval = 0

    try:
        # Needed by both the .cpp and the cmake generation, so bind it
        # regardless of which outputs were requested.
        baseCppName = os.path.join(tmp_output_dir, backend.outFileName)

        # generate .cpp files
        if args.cpp:
            templateCpp = os.path.join(thisDir, 'templates', backend.template)

            for fileNum in range(numFiles):
                MakoTemplateWriter.to_file(
                    templateCpp,
                    baseCppName % str(fileNum),
                    cmdline=sys.argv,
                    fileNum=fileNum,
                    funcList=chunkedList[fileNum])

        # generate .hpp file
        if args.hpp:
            baseHppName = os.path.join(tmp_output_dir, backend.outHeaderName)
            templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)

            MakoTemplateWriter.to_file(
                templateHpp,
                baseHppName,
                cmdline=sys.argv,
                numFiles=numFiles,
                filename=backend.outHeaderName,
                tableName=backend.tableName)

        # generate gen_backend.cmake file
        if args.cmake:
            templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
            cmakeFile = os.path.join(tmp_output_dir, backend.cmakeFileName)

            MakoTemplateWriter.to_file(
                templateCmake,
                cmakeFile,
                cmdline=sys.argv,
                srcVar=backend.cmakeSrcVar,
                numFiles=numFiles,
                baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(baseCppName))

        rval = CopyDirFilesIfDifferent(tmp_output_dir, args.outdir)

    except Exception as err:
        # Say what went wrong instead of failing silently; KeyboardInterrupt
        # and SystemExit are deliberately not trapped here.
        print('ERROR: Code generation failed:', err, file=sys.stderr)
        rval = 1

    finally:
        DeleteDirTree(tmp_output_dir)

    return rval
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
|
|
@ -1,291 +0,0 @@
|
|||
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
# Python source
|
||||
import os
|
||||
import errno
|
||||
import sys
|
||||
import argparse
|
||||
import tempfile
|
||||
import filecmp
|
||||
import shutil
|
||||
from mako.template import Template
|
||||
from mako.exceptions import RichTraceback
|
||||
|
||||
#==============================================================================
|
||||
def ConcatLists(list_of_lists):
    '''
        Flatten one level: return a new list holding the items of every
        list in <list_of_lists>, in order.
    '''
    merged = []
    for chunk in list_of_lists:
        merged.extend(chunk)
    return merged
|
||||
|
||||
#==============================================================================
|
||||
def MakeTmpDir(suffix=''):
    '''
        Make a new temporary directory for the codegen scripts, with
        <suffix> appended to its name, and return its path.
    '''
    tmp_path = tempfile.mkdtemp(suffix=suffix)
    return tmp_path
|
||||
|
||||
#==============================================================================
|
||||
def MakeDir(dir_path):
    '''
        Create a directory (and any missing parents) if it doesn't exist

        returns 0 on success, non-zero on failure. An existing directory
        counts as success; an existing path that is not a directory, or any
        other OS error while creating it, counts as failure.
    '''
    dir_path = os.path.abspath(dir_path)

    try:
        # exist_ok folds the "already there?" check into the call itself, so
        # there is no window between checking and creating, and a non-directory
        # at dir_path is still reported as an error.
        os.makedirs(dir_path, exist_ok=True)
    except OSError:
        return 1

    return 0
|
||||
|
||||
#==============================================================================
|
||||
def DeleteDirTree(dir_path):
    '''
        Delete directory tree.

        returns 0 on success, non-zero on failure
    '''
    try:
        # Second positional argument is ignore_errors: failures raise.
        shutil.rmtree(dir_path, False)
    except Exception:
        # Report failure through the return code.  Only ordinary errors
        # are caught so that KeyboardInterrupt / SystemExit still
        # propagate to the caller.
        return 1
    return 0
|
||||
|
||||
#==============================================================================
|
||||
def CopyFileIfDifferent(src, dst, verbose = False):
    '''
        Copy <src> file to <dst> file if the <dst>
        file either doesn't exist or the file
        contents are different.

        <src> must be an existing file; <dst> must either not exist or be
        a file (both are checked with assert).

        If <verbose> is set, each performed copy is printed.

        returns 0 on success, non-zero on failure
    '''

    assert os.path.isfile(src)
    assert (not os.path.exists(dst)) or os.path.isfile(dst)

    # shallow=False forces a byte-wise comparison.  The default (shallow)
    # mode treats files with identical size and mtime as equal without
    # reading them, which would skip a needed copy.
    need_copy = (not os.path.exists(dst)) or (not filecmp.cmp(src, dst, shallow=False))

    if need_copy:
        try:
            shutil.copy2(src, dst)
        except Exception:
            # Ordinary failures become a non-zero return code;
            # KeyboardInterrupt / SystemExit are left to propagate.
            print('ERROR: Could not copy %s to %s' % (src, dst), file=sys.stderr)
            return 1

        if verbose:
            print(src, '-->', dst)

    return 0
|
||||
|
||||
#==============================================================================
|
||||
def CopyDirFilesIfDifferent(src, dst, recurse = True, verbose = False, orig_dst = None):
    '''
        Copy files <src> directory to <dst> directory if the <dst>
        directory either doesn't contain the file or the file
        contents are different.

        Optionally recurses into subdirectories; with <recurse> unset,
        subdirectories of <src> are skipped.

        <orig_dst> is used by the recursive calls to remember the
        top-level destination so that it is never copied into itself.

        Both <src> and <dst> must be existing directories (checked with
        assert).

        returns 0 on success, non-zero on failure
    '''

    assert os.path.isdir(src)
    assert os.path.isdir(dst)

    src = os.path.abspath(src)
    dst = os.path.abspath(dst)

    if not orig_dst:
        orig_dst = dst

    for f in os.listdir(src):
        src_path = os.path.join(src, f)
        dst_path = os.path.join(dst, f)

        # prevent recursion
        if src_path == orig_dst:
            continue

        # Start every entry with a clean status.  Without this a skipped
        # subdirectory (recurse unset) left rval unbound on the first
        # iteration and the check below raised UnboundLocalError.
        rval = 0

        if os.path.isdir(src_path):
            if recurse:
                if MakeDir(dst_path):
                    print('ERROR: Could not create directory:', dst_path, file=sys.stderr)
                    return 1

                if verbose:
                    print('mkdir', dst_path)
                rval = CopyDirFilesIfDifferent(src_path, dst_path, recurse, verbose, orig_dst)
        else:
            rval = CopyFileIfDifferent(src_path, dst_path, verbose)

        if rval:
            return rval

    return 0
|
||||
|
||||
#==============================================================================
|
||||
class MakoTemplateWriter:
    '''
        Namespace-style class grouping helpers that render Mako templates
        either into a string or straight into a file.

        Mako documentation: http://docs.makotemplates.org/en/latest/
    '''

    @staticmethod
    def to_string(template_filename, **kwargs):
        '''
            Render the template file with <kwargs> and return the text.

            If rendering fails, the Mako traceback is printed and the
            exception is re-raised.
        '''
        from mako.template import Template
        from mako.exceptions import RichTraceback

        try:
            rendered = Template(filename=template_filename).render(**kwargs)
            # Splitting and re-joining normalizes the line endings to '\n'
            return '\n'.join(rendered.splitlines())
        except:
            rich_tb = RichTraceback()
            for (tb_file, tb_lineno, tb_function, tb_line) in rich_tb.traceback:
                print('File %s, line %s, in %s' % (tb_file, tb_lineno, tb_function))
                print(tb_line, '\n')
            print('%s: %s' % (str(rich_tb.error.__class__.__name__), rich_tb.error))
            raise

    @staticmethod
    def to_file(template_filename, output_filename, **kwargs):
        '''
            Render the template file with <kwargs> into <output_filename>,
            creating the output directory first if needed.

            returns 0 on success, 1 if the directory could not be created
        '''
        target_dir = os.path.dirname(output_filename)
        if MakeDir(target_dir):
            return 1

        # The file is opened before rendering, exactly as before, so the
        # order of side effects is unchanged.
        with open(output_filename, 'w') as outfile:
            rendered = MakoTemplateWriter.to_string(template_filename, **kwargs)
            outfile.write(rendered + '\n')

        return 0
|
||||
|
||||
|
||||
#==============================================================================
|
||||
class ArgumentParser(argparse.ArgumentParser):
    '''
    Subclass of argparse.ArgumentParser

    Allow parsing from command files that start with @
    Example:
      >bt run @myargs.txt

    Contents of myargs.txt:
      -m <machine>
      --target cdv_win7

    The below function allows multiple args to be placed on the same text-file line.
    The default is one token per line, which is a little cumbersome.

    Also allow all characters after a '#' character to be ignored.
    '''

    #==============================================================================
    class _HelpFormatter(argparse.RawTextHelpFormatter):
        ''' Better help formatter for argument parser '''

        def _split_lines(self, text, width):
            ''' optimized split lines algorithm, indents split lines '''
            # <width> is ignored: lines are only split where the help
            # text itself contains line breaks.
            lines = text.splitlines()
            out_lines = []
            if len(lines):
                out_lines.append(lines[0])
                for line in lines[1:]:
                    out_lines.append(' ' + line)
            return out_lines

    #==============================================================================
    def __init__(self, *args, **kwargs):
        ''' Constructor. Compatible with argparse.ArgumentParser(),
            but with some modifications for better usage and help display.
        '''
        super(ArgumentParser, self).__init__(
            *args,
            fromfile_prefix_chars='@',
            formatter_class=ArgumentParser._HelpFormatter,
            **kwargs)

    #==========================================================================
    def convert_arg_line_to_args(self, arg_line):
        ''' convert one line of parsed file to arguments

            Everything after the first '#' is dropped; the rest is split
            with shell-like quoting rules and yielded token by token.
        '''
        # Imported locally so this method does not depend on a
        # file-level "import shlex" being present.
        import shlex

        arg_line = arg_line.split('#', 1)[0]
        if sys.platform == 'win32':
            # Double the backslashes so shlex keeps them (Windows paths)
            arg_line = arg_line.replace('\\', '\\\\')
        for arg in shlex.split(arg_line):
            if not arg.strip():
                continue
            yield arg

    #==========================================================================
    def _read_args_from_files(self, arg_strings):
        ''' read arguments from files

            Returns a new argument list in which every "@file" argument
            is replaced by the (recursively expanded) arguments read
            from that file.  Unreadable files are reported through
            self.error().
        '''
        # expand arguments referencing files
        new_arg_strings = []
        for arg_string in arg_strings:

            # for regular arguments, just add them back into the list
            # (an empty string is a regular argument too)
            if not arg_string or arg_string[0] not in self.fromfile_prefix_chars:
                new_arg_strings.append(arg_string)
                continue

            # replace arguments referencing files with the file content
            filename = arg_string[1:]

            # Search in sys.path; keep the name as given when nothing
            # matches so the error message names what the user typed.
            if not os.path.exists(filename):
                for path in sys.path:
                    candidate = os.path.join(path, arg_string[1:])
                    if os.path.exists(candidate):
                        filename = candidate
                        break

            try:
                with open(filename) as args_file:
                    file_args = []
                    for arg_line in args_file.read().splitlines():
                        file_args.extend(self.convert_arg_line_to_args(arg_line))
                    # files may reference further @files
                    new_arg_strings.extend(self._read_args_from_files(file_args))
            except IOError as err:
                self.error(str(err))

        # return the modified argument list
        return new_arg_strings
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
# Python source
|
||||
import os
|
||||
import sys
|
||||
import knob_defs
|
||||
from gen_common import *
|
||||
|
||||
def main(args=sys.argv[1:]):
    '''
        Generate gen_knobs.h and/or gen_knobs.cpp from the knob
        definitions.

        The file is rendered into a temporary directory and only copied
        to the requested --output path when it differs from what is
        already there.

        <args> is the list of command line arguments to parse.

        returns 0 on success, non-zero on failure
    '''

    # parse args
    parser = ArgumentParser()
    parser.add_argument("--output", "-o", help="Path to output file", required=True)
    parser.add_argument("--gen_h", "-gen_h", help="Generate gen_knobs.h", action="store_true", default=False)
    parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate gen_knobs.cpp", action="store_true", required=False)

    # Honor the arguments handed to main() instead of silently
    # re-reading sys.argv.
    args = parser.parse_args(args)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
    template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h')

    output_filename = os.path.basename(args.output)
    output_dir = MakeTmpDir('_codegen')

    output_file = os.path.join(output_dir, output_filename)

    rval = 0

    try:
        if args.gen_h:
            rval = MakoTemplateWriter.to_file(
                template_h,
                output_file,
                cmdline=sys.argv,
                filename='gen_knobs',
                knobs=knob_defs.KNOBS)

        if args.gen_cpp and not rval:
            rval = MakoTemplateWriter.to_file(
                template_cpp,
                output_file,
                cmdline=sys.argv,
                filename='gen_knobs',
                knobs=knob_defs.KNOBS,
                includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'])

        if not rval:
            rval = CopyFileIfDifferent(output_file, args.output)

    except Exception:
        print('ERROR: Could not generate gen_knobs', file=sys.stderr)
        rval = 1

    finally:
        # ignore errors from delete of tmp directory
        DeleteDirTree(output_dir)

    # Propagate the status so a failed generation fails the build step.
    return rval

if __name__ == '__main__':
    sys.exit(main())
|
||||
|
||||
|
|
@ -1,362 +0,0 @@
|
|||
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
import os, sys, re
|
||||
from gen_common import *
|
||||
from argparse import FileType
|
||||
|
||||
# Maps the CAMEL_CASE name derived from an IRBuilder Create* function to
# the alias used for it instead (looked up in parse_ir_builder).
inst_aliases = {
    'SHUFFLE_VECTOR': 'VSHUFFLE',
    'INSERT_ELEMENT': 'VINSERT',
    'EXTRACT_ELEMENT': 'VEXTRACT',
    'MEM_SET': 'MEMSET',
    'MEM_CPY': 'MEMCOPY',
    'MEM_MOVE': 'MEMMOVE',
    'L_SHR': 'LSHR',
    'A_SHR': 'ASHR',
    'BIT_CAST': 'BITCAST',
    'U_DIV': 'UDIV',
    'S_DIV': 'SDIV',
    'U_REM': 'UREM',
    'S_REM': 'SREM',
    'BIN_OP': 'BINOP',
}

# Table consumed by generate_meta_h.  Each row is
#   [name, [argument names], return]
# where <return> is either the name of one of the arguments (the
# generated return type is then '<arg>->getType()') or a snippet of code
# used verbatim as the return type.
intrinsics = [
    ['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
    ['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
    ['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
    ['VSCATTERPS', ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'],
    ['VRCPPS', ['a'], 'a'],
    ['VROUND', ['a', 'rounding'], 'a'],
    ['BEXTR_32', ['src', 'control'], 'src'],
    ['VPSHUFB', ['a', 'b'], 'a'],
    ['VPERMD', ['a', 'idx'], 'a'],
    ['VPERMPS', ['idx', 'a'], 'a'],
    ['VCVTPD2PS', ['a'], 'getVectorType(mFP32Ty, VEC_GET_NUM_ELEMS)'],
    ['VCVTPS2PH', ['a', 'round'], 'mSimdInt16Ty'],
    ['VHSUBPS', ['a', 'b'], 'a'],
    ['VPTESTC', ['a', 'b'], 'mInt32Ty'],
    ['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
    ['VPHADDD', ['a', 'b'], 'a'],
    ['PDEP32', ['a', 'b'], 'a'],
    ['RDTSC', [], 'mInt64Ty'],
]

# Table consumed by generate_intrin_h.  Each row is
#   [declared name, intrinsic name, [argument names], [types]]
# and is handed to the template as 'decl' (built from the declared name
# and arguments), 'intrin', 'args' and 'types'.
llvm_intrinsics = [
    ['CTTZ', 'cttz', ['a', 'flag'], ['a']],
    ['CTLZ', 'ctlz', ['a', 'flag'], ['a']],
    ['VSQRTPS', 'sqrt', ['a'], ['a']],
    ['STACKSAVE', 'stacksave', [], []],
    ['STACKRESTORE', 'stackrestore', ['a'], []],
    ['VMINPS', 'minnum', ['a', 'b'], ['a']],
    ['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
    ['VFMADDPS', 'fmuladd', ['a', 'b', 'c'], ['a']],
    ['DEBUGTRAP', 'debugtrap', [], []],
    ['POPCNT', 'ctpop', ['a'], ['a']],
    ['LOG2', 'log2', ['a'], ['a']],
    ['FABS', 'fabs', ['a'], ['a']],
    ['EXP2', 'exp2', ['a'], ['a']],
    ['COS', 'cos', ['a'], ['a']],
    ['SIN', 'sin', ['a'], ['a']],
    ['FLOOR', 'floor', ['a'], ['a']],
    ['POW', 'pow', ['a', 'b'], ['a']]
]

# Path of the Mako template used by all generate_* functions below; it
# lives in the 'templates' directory next to this script.
this_dir = os.path.dirname(os.path.abspath(__file__))
template = os.path.join(this_dir, 'templates', 'gen_builder.hpp')
|
||||
|
||||
def convert_uppercamel(name):
    '''
        Convert an UpperCamelCase identifier to UPPER_SNAKE_CASE,
        e.g. 'ShuffleVector' -> 'SHUFFLE_VECTOR'.
    '''
    # Pass 1: put '_' in front of every capitalized word that follows
    # another character.
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: put '_' between a lower-case letter or digit and the
    # capital that follows it.
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.upper()
|
||||
|
||||
'''
|
||||
Given an input file (e.g. IRBuilder.h) generates function dictionary.
|
||||
'''
|
||||
def parse_ir_builder(input_file):
    '''
        Scan the lines of <input_file> (e.g. IRBuilder.h) for Create*
        function declarations.

        Returns a list with one dictionary per accepted function, holding
        its 'name', 'alias', 'return' type, 'args' (the argument list as
        one string) and 'arg_names' (list of bare argument names).
    '''
    # Create* functions that never get an entry.  The first three are
    # ignored in openswr because an API change in llvm-5.0 breaks baked
    # autogen files.
    ignored_funcs = (
        'CreateFence',
        'CreateAtomicCmpXchg',
        'CreateAtomicRMW',
        'CreateInsertNUWNSWBinOp',
        'CreateMaskedIntrinsic',
        'CreateAlignmentAssumptionHelper',
        'CreateGEP',
        'CreateLoad',
        'CreateMaskedLoad',
        'CreateStore',
        'CreateMaskedStore',
        'CreateFCmpHelper',
        'CreateElementUnorderedAtomicMemCpy',
    )

    lines = input_file.readlines()
    functions = []
    deprecated = None

    idx = 0
    while idx < len(lines) - 1:
        line = lines[idx].rstrip()
        idx += 1

        # Remember a deprecation marker until the next Create* line
        if deprecated is None:
            deprecated = re.search(r'LLVM_ATTRIBUTE_DEPRECATED', line)

        if re.search(r'[\*\s]Create(\w*)\(', line) is None:
            continue

        # Skip function if LLVM_ATTRIBUTE_DEPRECATED found before
        if deprecated is not None:
            deprecated = None
            continue

        # A line that starts with the function name is joined with the
        # line before it (idx was already advanced past this line).
        func_sig = line
        if re.search(r'^\s*Create', line) is not None:
            func_sig = lines[idx-2].rstrip() + line

        # Keep appending lines until one containing ')' has been seen
        while re.search(r'\)', line) is None:
            line = lines[idx].rstrip()
            func_sig += line
            idx += 1

        # Deleted functions get no entry
        if re.search(r'LLVM_DELETED_FUNCTION|= delete;', func_sig):
            continue

        func = re.search(r'(.*?)\*[\n\s]*(Create\w*)\((.*?)\)', func_sig)
        if func is None:
            continue

        return_type = func.group(1).strip() + '*'
        func_name = func.group(2)

        func_args = []
        arg_names = []
        for arg in func.group(3).split(','):
            arg = arg.strip()
            if not arg:
                continue
            func_args.append(arg)

            # The name is the last whitespace-separated token in front
            # of any '=' default, minus leading '&' / '*'.
            arg_name = arg.split('=')[0].rsplit(None, 1)[-1]
            reg_arg = re.search(r'[\&\*]*(\w*)', arg_name)
            if reg_arg:
                arg_names.append(reg_arg.group(1))

        # Convert CamelCase to CAMEL_CASE
        func_mod = re.search(r'Create(\w*)', func_name)
        if func_mod:
            func_mod = convert_uppercamel(func_mod.group(1))
            if func_mod[0:2] in ('F_', 'I_'):
                func_mod = func_mod[0] + func_mod[2:]

        # Substitute alias based on CAMEL_CASE name.
        func_alias = inst_aliases.get(func_mod) or func_mod

        # Variants taking an ArrayRef get an 'A' suffix
        if func_name in ('CreateCall', 'CreateGEP'):
            if re.search(r'ArrayRef', ', '.join(func_args)):
                func_alias = func_alias + 'A'

        if func_name not in ignored_funcs:
            functions.append({
                'name'      : func_name,
                'alias'     : func_alias,
                'return'    : return_type,
                'args'      : ', '.join(func_args),
                'arg_names' : arg_names,
            })

    return functions
|
||||
|
||||
'''
|
||||
Auto-generates macros for LLVM IR
|
||||
'''
|
||||
def generate_gen_h(functions, output_dir):
    '''
        Render gen_builder.hpp into <output_dir>.

        <functions> is a list of dictionaries with the keys 'name',
        'alias', 'return', 'args' and 'arg_names'; each one becomes a
        template entry with a declaration string, the name of the
        function and the argument names.
    '''
    filename = 'gen_builder.hpp'
    output_filename = os.path.join(output_dir, filename)

    templfuncs = [
        {
            'decl'      : '%s %s(%s)' % (func['return'], func['alias'], func['args']),
            'intrin'    : func['name'],
            'args'      : func['arg_names'],
        }
        for func in functions
    ]

    MakoTemplateWriter.to_file(
        template,
        output_filename,
        cmdline=sys.argv,
        comment='Builder IR Wrappers',
        filename=filename,
        functions=templfuncs,
        isX86=False, isIntrin=False)
|
||||
|
||||
'''
|
||||
Auto-generates macros for LLVM IR
|
||||
'''
|
||||
def generate_meta_h(output_dir):
    '''
        Render gen_builder_meta.hpp into <output_dir> with one template
        entry per row of the module-level `intrinsics` table.
    '''
    filename = 'gen_builder_meta.hpp'
    output_filename = os.path.join(output_dir, filename)

    functions = []
    for name, args, ret in intrinsics:
        if args:
            params = ', '.join('Value* ' + arg for arg in args)
            decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, params)
        else:
            decl = 'Value* %s(const llvm::Twine& name = "")' % (name)

        # The return type is either the type of one of the input
        # arguments or a snippet of code that is used as is.
        returnTy = ret + '->getType()' if ret in args else ret

        functions.append({
            'decl'      : decl,
            'name'      : name,
            'args'      : args,
            'returnType': returnTy
        })

    MakoTemplateWriter.to_file(
        template,
        output_filename,
        cmdline=sys.argv,
        comment='meta intrinsics',
        filename=filename,
        functions=functions,
        isX86=True, isIntrin=False)
|
||||
|
||||
def generate_intrin_h(output_dir):
    '''
        Render gen_builder_intrin.hpp into <output_dir> with one template
        entry per row of the module-level `llvm_intrinsics` table.
    '''
    filename = 'gen_builder_intrin.hpp'
    output_filename = os.path.join(output_dir, filename)

    functions = []
    for decl_name, intrin_name, arg_names, type_names in llvm_intrinsics:
        if arg_names:
            params = ', '.join('Value* ' + arg for arg in arg_names)
            decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (decl_name, params)
        else:
            decl = 'Value* %s(const llvm::Twine& name = "")' % (decl_name)

        functions.append({
            'decl'      : decl,
            'intrin'    : intrin_name,
            'args'      : arg_names,
            'types'     : type_names,
        })

    MakoTemplateWriter.to_file(
        template,
        output_filename,
        cmdline=sys.argv,
        comment='llvm intrinsics',
        filename=filename,
        functions=functions,
        isX86=False, isIntrin=True)
|
||||
'''
|
||||
Function which is invoked when this script is started from a command line.
|
||||
Will present and consume a set of arguments which will tell this script how
|
||||
to behave
|
||||
'''
|
||||
def main():
    '''
        Command line entry point.

        Generates the requested headers into a temporary directory and
        then copies the files that are new or changed into the directory
        given with --output-dir.

        returns 0 on success, non-zero on failure
    '''

    # Parse args...
    parser = ArgumentParser()
    parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
    parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
    parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
    parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
    parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
    args = parser.parse_args()

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    final_output_dir = args.output
    args.output = MakeTmpDir('_codegen')

    rval = 0
    # Set when --gen_h was requested without --input; the other outputs
    # are still generated, but the run is reported as failed.
    missing_input = False
    try:
        if args.input:
            functions = parse_ir_builder(args.input)

            if args.gen_h:
                generate_gen_h(functions, args.output)

        elif args.gen_h:
            print('Need to specify --input for --gen_h!', file=sys.stderr)
            missing_input = True

        if args.gen_meta_h:
            generate_meta_h(args.output)

        if args.gen_intrin_h:
            generate_intrin_h(args.output)

        rval = CopyDirFilesIfDifferent(args.output, final_output_dir)

        if missing_input and not rval:
            rval = 1

    except Exception:
        print('ERROR: Could not generate llvm_ir_macros', file=sys.stderr)
        rval = 1

    finally:
        # The input file was opened by argparse (FileType); close it here.
        if args.input:
            args.input.close()
        DeleteDirTree(args.output)

    return rval

if __name__ == '__main__':
    sys.exit(main())
|
||||
# END OF FILE
|
||||
|
|
@ -1,360 +0,0 @@
|
|||
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
|
||||
import os, sys, re
|
||||
from gen_common import *
|
||||
from argparse import FileType
|
||||
|
||||
'''
|
||||
'''
|
||||
# C/C++ type names with a fixed translation, mapped to the text of the
# expression that produces the matching LLVM type.
_FIXED_LLVM_TYPES = {
    'BYTE'             : 'Type::getInt8Ty(ctx)',
    'char'             : 'Type::getInt8Ty(ctx)',
    'uint8_t'          : 'Type::getInt8Ty(ctx)',
    'int8_t'           : 'Type::getInt8Ty(ctx)',
    'bool'             : 'Type::getInt8Ty(ctx)',
    'UINT64'           : 'Type::getInt64Ty(ctx)',
    'INT64'            : 'Type::getInt64Ty(ctx)',
    'uint64_t'         : 'Type::getInt64Ty(ctx)',
    'int64_t'          : 'Type::getInt64Ty(ctx)',
    'gfxptr_t'         : 'Type::getInt64Ty(ctx)',
    'UINT16'           : 'Type::getInt16Ty(ctx)',
    'int16_t'          : 'Type::getInt16Ty(ctx)',
    'uint16_t'         : 'Type::getInt16Ty(ctx)',
    'UINT'             : 'Type::getInt32Ty(ctx)',
    'INT'              : 'Type::getInt32Ty(ctx)',
    'int'              : 'Type::getInt32Ty(ctx)',
    'BOOL'             : 'Type::getInt32Ty(ctx)',
    'uint32_t'         : 'Type::getInt32Ty(ctx)',
    'int32_t'          : 'Type::getInt32Ty(ctx)',
    'float'            : 'Type::getFloatTy(ctx)',
    'FLOAT'            : 'Type::getFloatTy(ctx)',
    'double'           : 'Type::getDoubleTy(ctx)',
    'DOUBLE'           : 'Type::getDoubleTy(ctx)',
    'void'             : 'Type::getInt32Ty(ctx)',
    'VOID'             : 'Type::getInt32Ty(ctx)',
    'HANDLE'           : 'PointerType::get(Type::getInt32Ty(ctx), 0)',
    'simdscalar'       : 'getVectorType(Type::getFloatTy(ctx), pJitMgr->mVWidth)',
    'simdscalari'      : 'getVectorType(Type::getInt32Ty(ctx), pJitMgr->mVWidth)',
    'simd16scalar'     : 'getVectorType(Type::getFloatTy(ctx), 16)',
    'simd16scalari'    : 'getVectorType(Type::getInt32Ty(ctx), 16)',
    '__m128i'          : 'getVectorType(Type::getInt32Ty(ctx), 4)',
    'SIMD256::Float'   : 'getVectorType(Type::getFloatTy(ctx), 8)',
    'SIMD256::Integer' : 'getVectorType(Type::getInt32Ty(ctx), 8)',
    'SIMD512::Float'   : 'getVectorType(Type::getFloatTy(ctx), 16)',
    'SIMD512::Integer' : 'getVectorType(Type::getInt32Ty(ctx), 16)',
    'simdvector'       : 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)',
    'simd16vector'     : 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)',
    'SIMD256::Vec4'    : 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)',
    'SIMD512::Vec4'    : 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)',
}

def gen_llvm_type(type, name, idx, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file):
    '''
        Build the description of one structure member.

        Translates the member's C/C++ <type> plus its pointer / array /
        @llvm_* properties into the text of an expression creating the
        LLVM type.  <output_file> is not used.

        Returns a dictionary with the member 'name', its 'lineNum'
        (<idx>) and the generated 'type' expression.
    '''
    # Base type
    if is_llvm_struct:
        if is_pointer or is_pointer_pointer:
            llvm_type = 'Type::getInt32Ty(ctx)'
        else:
            # by-value struct: array of bytes of the struct's size
            llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type
    elif is_llvm_enum:
        llvm_type = 'Type::getInt32Ty(ctx)'
    elif is_llvm_pfn:
        llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)'
    elif type in _FIXED_LLVM_TYPES:
        llvm_type = _FIXED_LLVM_TYPES[type]
    else:
        # any other type name: refer to its Gen_<type> function
        llvm_type = 'Gen_%s(pJitMgr)' % type

    # One pointer level per flag that is set
    for add_pointer_level in (is_pointer, is_pointer_pointer):
        if add_pointer_level:
            llvm_type = 'PointerType::get(%s, 0)' % llvm_type

    # Array wrapping; the two-dimensional form takes precedence
    if is_array_array:
        llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count)
    elif is_array:
        llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)

    return {
        'name'    : name,
        'lineNum' : idx,
        'type'    : llvm_type,
    }
|
||||
|
||||
'''
|
||||
'''
|
||||
def gen_llvm_types(input_file, output_file):
|
||||
|
||||
lines = input_file.readlines()
|
||||
|
||||
types = []
|
||||
|
||||
for idx in range(len(lines)):
|
||||
line = lines[idx].rstrip()
|
||||
|
||||
if 'gen_llvm_types FINI' in line:
|
||||
break
|
||||
|
||||
match = re.match(r'(\s*)struct(\s*)(\w+)', line)
|
||||
if match:
|
||||
llvm_args = []
|
||||
|
||||
# Detect start of structure
|
||||
is_fwd_decl = re.search(r';', line)
|
||||
|
||||
if not is_fwd_decl:
|
||||
|
||||
# Extract the command name
|
||||
struct_name = match.group(3).strip()
|
||||
|
||||
type_entry = {
|
||||
'name' : struct_name,
|
||||
'lineNum' : idx+1,
|
||||
'members' : [],
|
||||
}
|
||||
|
||||
end_of_struct = False
|
||||
|
||||
while not end_of_struct and idx < len(lines)-1:
|
||||
idx += 1
|
||||
line = lines[idx].rstrip()
|
||||
|
||||
is_llvm_typedef = re.search(r'@llvm_typedef', line)
|
||||
if is_llvm_typedef is not None:
|
||||
is_llvm_typedef = True
|
||||
continue
|
||||
else:
|
||||
is_llvm_typedef = False
|
||||
|
||||
###########################################
|
||||
# Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure.
|
||||
is_llvm_struct = re.search(r'@llvm_struct', line)
|
||||
|
||||
if is_llvm_struct is not None:
|
||||
is_llvm_struct = True
|
||||
else:
|
||||
is_llvm_struct = False
|
||||
|
||||
###########################################
|
||||
# Is field the start of a function? Tells script to ignore it
|
||||
is_llvm_func_start = re.search(r'@llvm_func_start', line)
|
||||
|
||||
if is_llvm_func_start is not None:
|
||||
while not end_of_struct and idx < len(lines)-1:
|
||||
idx += 1
|
||||
line = lines[idx].rstrip()
|
||||
is_llvm_func_end = re.search(r'@llvm_func_end', line)
|
||||
if is_llvm_func_end is not None:
|
||||
break;
|
||||
continue
|
||||
|
||||
###########################################
|
||||
# Is field a function? Tells script to ignore it
|
||||
is_llvm_func = re.search(r'@llvm_func', line)
|
||||
|
||||
if is_llvm_func is not None:
|
||||
continue
|
||||
|
||||
###########################################
|
||||
# Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type.
|
||||
is_llvm_enum = re.search(r'@llvm_enum', line)
|
||||
|
||||
if is_llvm_enum is not None:
|
||||
is_llvm_enum = True
|
||||
else:
|
||||
is_llvm_enum = False
|
||||
|
||||
###########################################
|
||||
# Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type.
|
||||
is_llvm_pfn = re.search(r'@llvm_pfn', line)
|
||||
|
||||
if is_llvm_pfn is not None:
|
||||
is_llvm_pfn = True
|
||||
else:
|
||||
is_llvm_pfn = False
|
||||
|
||||
###########################################
|
||||
# Is field const?
|
||||
is_const = re.search(r'\s+const\s+', line)
|
||||
|
||||
if is_const is not None:
|
||||
is_const = True
|
||||
else:
|
||||
is_const = False
|
||||
|
||||
###########################################
|
||||
# Is field a pointer?
|
||||
is_pointer_pointer = re.search('\*\*', line)
|
||||
|
||||
if is_pointer_pointer is not None:
|
||||
is_pointer_pointer = True
|
||||
else:
|
||||
is_pointer_pointer = False
|
||||
|
||||
###########################################
|
||||
# Is field a pointer?
|
||||
is_pointer = re.search('\*', line)
|
||||
|
||||
if is_pointer is not None:
|
||||
is_pointer = True
|
||||
else:
|
||||
is_pointer = False
|
||||
|
||||
###########################################
|
||||
# Is field an array of arrays?
|
||||
# TODO: Can add this to a list.
|
||||
is_array_array = re.search('\[(\w*)\]\[(\w*)\]', line)
|
||||
array_count = '0'
|
||||
array_count1 = '0'
|
||||
|
||||
if is_array_array is not None:
|
||||
array_count = is_array_array.group(1)
|
||||
array_count1 = is_array_array.group(2)
|
||||
is_array_array = True
|
||||
else:
|
||||
is_array_array = False
|
||||
|
||||
###########################################
|
||||
# Is field an array?
|
||||
is_array = re.search('\[(\w*)\]', line)
|
||||
|
||||
if is_array is not None:
|
||||
array_count = is_array.group(1)
|
||||
is_array = True
|
||||
else:
|
||||
is_array = False
|
||||
|
||||
is_scoped = re.search('::', line)
|
||||
|
||||
if is_scoped is not None:
|
||||
is_scoped = True
|
||||
else:
|
||||
is_scoped = False
|
||||
|
||||
type = None
|
||||
name = None
|
||||
if is_const and is_pointer:
|
||||
|
||||
if is_scoped:
|
||||
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)', line)
|
||||
|
||||
type = '%s%s' % (field_match.group(4), field_match.group(5))
|
||||
name = field_match.group(7)
|
||||
else:
|
||||
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)', line)
|
||||
|
||||
type = field_match.group(4)
|
||||
name = field_match.group(6)
|
||||
|
||||
elif is_pointer:
|
||||
field_match = re.match(r'(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)', line)
|
||||
|
||||
if field_match:
|
||||
type = field_match.group(3)
|
||||
name = field_match.group(5)
|
||||
elif is_const:
|
||||
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)', line)
|
||||
|
||||
if field_match:
|
||||
type = field_match.group(4)
|
||||
name = field_match.group(6)
|
||||
else:
|
||||
if is_scoped:
|
||||
field_match = re.match(r'\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)', line)
|
||||
|
||||
if field_match:
|
||||
type = field_match.group(1) + '::' + field_match.group(2)
|
||||
name = field_match.group(3)
|
||||
else:
|
||||
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)', line)
|
||||
|
||||
if field_match:
|
||||
type = field_match.group(2)
|
||||
name = field_match.group(4)
|
||||
|
||||
if is_llvm_typedef is False:
|
||||
if type is not None:
|
||||
type_entry['members'].append(
|
||||
gen_llvm_type(
|
||||
type, name, idx+1, is_pointer, is_pointer_pointer, is_array, is_array_array,
|
||||
array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file))
|
||||
|
||||
# Detect end of structure
|
||||
end_of_struct = re.match(r'(\s*)};', line)
|
||||
|
||||
if end_of_struct:
|
||||
types.append(type_entry)
|
||||
|
||||
cur_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
template = os.path.join(cur_dir, 'templates', 'gen_llvm.hpp')
|
||||
|
||||
MakoTemplateWriter.to_file(
|
||||
template,
|
||||
output_file,
|
||||
cmdline=sys.argv,
|
||||
filename=os.path.basename(output_file),
|
||||
types=types,
|
||||
input_dir=os.path.dirname(input_file.name),
|
||||
input_file=os.path.basename(input_file.name))
|
||||
|
||||
def main():
    '''
    Function which is invoked when this script is started from a command line.
    Will present and consume a set of arguments which will tell this script how
    to behave

    Returns the process exit status: 1 when MakeDir reports a failure for the
    output directory or when generation raises, otherwise whatever
    CopyFileIfDifferent returns.
    '''

    # Parse args...
    parser = ArgumentParser()
    parser.add_argument('--input', '-i', type=FileType('r'),
                        help='Path to input file containing structs', required=True)
    parser.add_argument('--output', '-o', action='store',
                        help='Path to output file', required=True)
    args = parser.parse_args()

    final_output_dir = os.path.dirname(args.output)
    if MakeDir(final_output_dir):
        return 1

    final_output_file = args.output

    # Generate into a scratch directory first; the real output is only touched
    # by CopyFileIfDifferent below.
    tmp_dir = MakeTmpDir('_codegen')
    args.output = os.path.join(tmp_dir, os.path.basename(args.output))

    rval = 0
    try:
        gen_llvm_types(args.input, args.output)

        rval = CopyFileIfDifferent(args.output, final_output_file)
    except Exception as err:
        # Catch Exception rather than using a bare except so Ctrl-C and
        # sys.exit() still propagate, and report what actually went wrong.
        print('ERROR: Could not generate llvm types', file=sys.stderr)
        print('       %s: %s' % (type(err).__name__, err), file=sys.stderr)
        rval = 1

    finally:
        # argparse opened the input for us; close() is harmless if the
        # generator already closed it.
        args.input.close()
        DeleteDirTree(tmp_dir)

    return rval


if __name__ == '__main__':
    sys.exit(main())
|
||||
# END OF FILE
|
||||
|
|
@ -1,383 +0,0 @@
|
|||
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice (including the next
|
||||
# paragraph) shall be included in all copies or substantial portions of the
|
||||
# Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
# IN THE SOFTWARE.
|
||||
import sys
|
||||
|
||||
# Python source
|
||||
# Table of knob definitions. Each entry is a [NAME, attributes] pair; the
# attributes dict carries 'type', 'default', 'desc' (a list of description
# lines) and 'category', plus optional 'control'/'choices' for dropdown knobs.
KNOBS = [

    ['ENABLE_ASSERT_DIALOGS', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Use dialogs when asserts fire.',
                  'Asserts are only enabled in debug builds'],
        'category' : 'debug',
    }],

    ['SINGLE_THREADED', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['If enabled will perform all rendering on the API thread.',
                  'This is useful mainly for debugging purposes.'],
        'category' : 'debug',
    }],

    ['DUMP_SHADER_IR', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
        'category' : 'debug',
    }],

    ['USE_GENERIC_STORETILE', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Always use generic function for performing StoreTile.',
                  'Will be slightly slower than using optimized (jitted) path'],
        'category' : 'debug_adv',
    }],

    ['FAST_CLEAR', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
                  'defer clear execution to first backend op on hottile, or hottile store'],
        'category' : 'perf_adv',
    }],

    ['MAX_NUMA_NODES', {
        'type' : 'uint32_t',
        'default' : '1' if sys.platform == 'win32' else '0',
        'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
                  ' 0 == ALL NUMA-nodes in the system',
                  ' N == Use at most N NUMA-nodes for rendering'],
        'category' : 'perf',
    }],

    ['MAX_CORES_PER_NUMA_NODE', {
        'type' : 'uint32_t',
        'default' : '0',
        'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
                  ' 0 == ALL non-API thread cores per NUMA-node',
                  ' N == Use at most N cores per NUMA-node'],
        'category' : 'perf',
    }],

    ['MAX_THREADS_PER_CORE', {
        'type' : 'uint32_t',
        'default' : '1',
        'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
                  ' 0 == ALL hyper-threads per core',
                  ' N == Use at most N hyper-threads per physical core'],
        'category' : 'perf',
    }],

    ['MAX_WORKER_THREADS', {
        'type' : 'uint32_t',
        'default' : '0',
        'desc' : ['Maximum worker threads to spawn.',
                  '',
                  'IMPORTANT: If this is non-zero, no worker threads will be bound to',
                  'specific HW threads. They will all be "floating" SW threads.',
                  'In this case, the above 3 KNOBS will be ignored.'],
        'category' : 'perf',
    }],

    ['BASE_NUMA_NODE', {
        'type' : 'uint32_t',
        'default' : '0',
        'desc' : ['Starting NUMA node index to use when allocating compute resources.',
                  'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
        'category' : 'perf',
    }],

    ['BASE_CORE', {
        'type' : 'uint32_t',
        'default' : '0',
        'desc' : ['Starting core index to use when allocating compute resources.',
                  'Setting this to a non-zero value will reduce the maximum # of cores used.'],
        'category' : 'perf',
    }],

    ['BASE_THREAD', {
        'type' : 'uint32_t',
        'default' : '0',
        'desc' : ['Starting thread index to use when allocating compute resources.',
                  'Setting this to a non-zero value will reduce the maximum # of threads used.'],
        'category' : 'perf',
    }],

    ['BUCKETS_START_FRAME', {
        'type' : 'uint32_t',
        'default' : '1200',
        'desc' : ['Frame from when to start saving buckets data.',
                  '',
                  'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                  'for this to have an effect.'],
        'category' : 'perf_adv',
    }],

    ['BUCKETS_END_FRAME', {
        'type' : 'uint32_t',
        'default' : '1400',
        'desc' : ['Frame at which to stop saving buckets data.',
                  '',
                  'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                  'for this to have an effect.'],
        'category' : 'perf_adv',
    }],

    ['WORKER_SPIN_LOOP_COUNT', {
        'type' : 'uint32_t',
        'default' : '5000',
        'desc' : ['Number of spin-loop iterations worker threads will perform',
                  'before going to sleep when waiting for work'],
        'category' : 'perf_adv',
    }],

    ['MAX_DRAWS_IN_FLIGHT', {
        'type' : 'uint32_t',
        'default' : '256',
        'desc' : ['Maximum number of draws outstanding before API thread blocks.',
                  'This value MUST be evenly divisible into 2^32'],
        'category' : 'perf_adv',
    }],

    ['MAX_PRIMS_PER_DRAW', {
        'type' : 'uint32_t',
        'default' : '49152',
        'desc' : ['Maximum primitives in a single Draw().',
                  'Larger primitives are split into smaller Draw calls.',
                  'Should be a multiple of (3 * vectorWidth).'],
        'category' : 'perf_adv',
    }],

    ['MAX_TESS_PRIMS_PER_DRAW', {
        'type' : 'uint32_t',
        'default' : '16',
        'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
                  'Larger primitives are split into smaller Draw calls.',
                  'Should be a multiple of (vectorWidth).'],
        'category' : 'perf_adv',
    }],


    ['DEBUG_OUTPUT_DIR', {
        'type' : 'std::string',
        'default' : r'%TEMP%\Rast\DebugOutput' if sys.platform == 'win32' else '/tmp/Rast/DebugOutput',
        'desc' : ['Output directory for debug data.'],
        'category' : 'debug',
    }],

    ['JIT_ENABLE_CACHE', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Enables caching of compiled shaders'],
        'category' : 'debug_adv',
    }],

    ['JIT_OPTIMIZATION_LEVEL', {
        'type' : 'int',
        'default' : '-1',
        'desc' : ['JIT compile optimization level:',],
        'category' : 'debug',
        'control' : 'dropdown',
        'choices' : [
            {
                'name' : 'Automatic',
                'desc' : 'Automatic based on other KNOB and build settings',
                'value' : -1,
            },
            {
                'name' : 'Debug',
                'desc' : 'No optimization: -O0',
                'value' : 0,
            },
            {
                'name' : 'Less',
                'desc' : 'Some optimization: -O1',
                'value' : 1,
            },
            {
                'name' : 'Optimize',
                'desc' : 'Default Clang / LLVM optimizations: -O2',
                'value' : 2,
            },
            {
                'name' : 'Aggressive',
                'desc' : 'Maximum optimization: -O3',
                'value' : 3,
            },
        ],
    }],

    ['JIT_CACHE_DIR', {
        'type' : 'std::string',
        'default' : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else '${HOME}/.swr/jitcache',
        'desc' : ['Cache directory for compiled shaders.'],
        'category' : 'debug',
    }],

    ['TOSS_DRAW', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Disable per-draw/dispatch execution'],
        'category' : 'perf',
    }],

    ['TOSS_QUEUE_FE', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at worker FE',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['TOSS_FETCH', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at vertex fetch',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['TOSS_IA', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at input assembler',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['TOSS_VS', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at vertex shader',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['TOSS_SETUP_TRIS', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at primitive setup',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['TOSS_BIN_TRIS', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at primitive binning',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['TOSS_RS', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Stop per-draw execution at rasterizer',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
        'category' : 'perf_adv',
    }],

    ['DISABLE_SPLIT_DRAW', {
        'type' : 'bool',
        'default' : 'false',
        # The first line used to end in '.,' - a stray comma inside the string.
        'desc' : ['Don\'t split large draws into smaller draws.',
                  'MAX_PRIMS_PER_DRAW and MAX_TESS_PRIMS_PER_DRAW can be used to control split size.',
                  '',
                  'Useful to disable split draws for gathering archrast stats.'],
        'category' : 'perf_adv',
    }],

    ['AR_ENABLE_PIPELINE_STATS', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Enable pipeline stats when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_SHADER_STATS', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Enable shader stats when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_SWTAG_DATA', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Enable SWTag data when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_SWR_EVENTS', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Enable internal SWR events when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_PIPELINE_EVENTS', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Enable pipeline events when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_SHADER_EVENTS', {
        'type' : 'bool',
        'default' : 'true',
        'desc' : ['Enable shader events when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_SWTAG_EVENTS', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Enable SWTag events when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_ENABLE_MEMORY_EVENTS', {
        'type' : 'bool',
        'default' : 'false',
        'desc' : ['Enable memory events when using Archrast'],
        'category' : 'archrast',
    }],

    ['AR_MEM_SET_BYTE_GRANULARITY', {
        'type' : 'uint32_t',
        'default' : '64',
        'desc' : ['Granularity and alignment of tracking of memory accesses',
                  'ONLY ACTIVE UNDER ArchRast.'],
        'category' : 'archrast',
    }],


]
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
# Copyright © 2017-2018 Intel Corporation
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# gen_knobs.cpp: produced by running gen_knobs.py with --gen_cpp.
gen_knobs_cpp = custom_target(
  'gen_knobs.cpp',
  output : 'gen_knobs.cpp',
  input : ['gen_knobs.py'],
  # Extra files the generator depends on besides the script itself.
  depend_files : files(
    'gen_common.py',
    'knob_defs.py',
    'templates/gen_knobs.cpp',
  ),
  command : [
    prog_python, '@INPUT0@',
    '--output', '@OUTPUT@',
    '--gen_cpp',
  ],
)
|
||||
|
||||
# gen_knobs.h: produced by running gen_knobs.py with --gen_h.
gen_knobs_h = custom_target(
  'gen_knobs.h',
  output : 'gen_knobs.h',
  input : ['gen_knobs.py'],
  # Extra files the generator depends on besides the script itself.
  depend_files : files(
    'gen_common.py',
    'knob_defs.py',
    'templates/gen_knobs.h',
  ),
  command : [
    prog_python, '@INPUT0@',
    '--output', '@OUTPUT@',
    '--gen_h',
  ],
)
|
||||
|
||||
|
||||
# The generators above this are needed individually, while the below generators
# are all inputs to the same lib, so they don't need unique names.
files_swr_common += [
  gen_builder_hpp,
  gen_builder_meta_hpp,
  gen_knobs_h,
  gen_knobs_cpp,
]

# One gen_llvm_types.py run per [input files, generated header] pair.
foreach x : [
  [swr_context_files, 'gen_swr_context_llvm.h'],
  [swr_state_files, 'gen_state_llvm.h'],
  [swr_surf_state_files, 'gen_surf_state_llvm.h'],
]
  files_swr_common += custom_target(
    x[1],
    output : x[1],
    input : ['gen_llvm_types.py', x[0]],
    depend_files : files(
      'templates/gen_llvm.hpp',
      'gen_common.py',
    ),
    command : [
      prog_python, '@INPUT0@',
      '--input', '@INPUT1@',
      '--output', '@OUTPUT@',
    ],
  )
endforeach
|
||||
|
||||
# Files written by gen_archrast.py; each one has a same-named template under
# templates/.
ar_output_filenames = [
  'gen_ar_event.hpp',
  'gen_ar_event.cpp',
  'gen_ar_eventhandler.hpp',
  'gen_ar_eventhandlerfile.hpp',
]

ar_template_filenames = []
foreach fname : ar_output_filenames
  ar_template_filenames += join_paths('templates', fname)
endforeach

# A single generator invocation emits all of the archrast outputs into the
# current build directory.
files_swr_common += custom_target(
  'gen_archrast',
  output : ar_output_filenames,
  input : ['gen_archrast.py', swr_event_proto_files, swr_event_pproto_files],
  depend_files : files('gen_common.py', ar_template_filenames),
  command : [
    prog_python, '@INPUT0@',
    '--proto', '@INPUT1@', '@INPUT2@',
    '--output-dir', meson.current_build_dir(),
  ],
)
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}
|
||||
*
|
||||
* @brief Implementation for events. auto-generated file
|
||||
*
|
||||
* DO NOT EDIT
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandler.hpp"

using namespace ArchRast;

## Emit one out-of-line Accept() per event. Groups are visited in sorted
## order; events keep the order in which their group lists them.
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
    event = protos['events']['defs'][event_key]
%>
void ${event['name']}::Accept(EventHandler* pHandler) const
{
    pHandler->Handle(*this);
}
% endfor
% endfor


// clang-format on
|
||||
|
||||
|
|
@ -1,168 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}
|
||||
*
|
||||
* @brief Definitions for events. auto-generated file
|
||||
*
|
||||
* DO NOT EDIT
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
#include "core/state.h"
|
||||
|
||||
<%
    # Groups whose events are generated without an IsEnabled() override.
    always_enabled_knob_groups = ['Framework', 'SWTagFramework', 'ApiSwr']

    # Groups whose enabling knob is not named KNOB_AR_ENABLE_<GROUP>_EVENTS.
    group_knob_remap_table = dict(
        ShaderStats='KNOB_AR_ENABLE_SHADER_STATS',
        PipelineStats='KNOB_AR_ENABLE_PIPELINE_STATS',
        SWTagData='KNOB_AR_ENABLE_SWTAG_DATA',
    )
%>
|
||||
namespace ArchRast
|
||||
{
|
||||
## One C++ enum per entry in protos['enums']['defs'], visited in sorted order.
## Each stored name line is emitted with its leading whitespace stripped.
% for enum_name in sorted(protos['enums']['defs']):
enum ${enum_name}
{
% for entry in protos['enums']['defs'][enum_name]['names']:
    ${entry.lstrip()}
% endfor
};
% endfor
|
||||
|
||||
// Forward decl
class EventHandler;

//////////////////////////////////////////////////////////////////////////
/// Event - abstract base of every generated event type.
///
/// Carries a default event id of 0xFFFFFFFF and reports itself as enabled
/// unless a derived event overrides IsEnabled(). GetEventId() and Accept()
/// must be provided by the derived event.
//////////////////////////////////////////////////////////////////////////
struct Event
{
    const uint32_t eventId{0xFFFFFFFF};

    Event()          = default;
    virtual ~Event() = default;

    virtual bool IsEnabled() const
    {
        return true;
    }

    virtual const uint32_t GetEventId() const                  = 0;
    virtual void           Accept(EventHandler* pHandler) const = 0;
};
|
||||
|
||||
## For every event (groups in sorted order) emit a packed <Name>Data struct
## holding the raw fields, and a <Name> event struct deriving from Event that
## fills in the data from its constructor arguments.
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
    event = protos['events']['defs'][event_key]
    fields = event['fields']
%>
//////////////////////////////////////////////////////////////////////////
/// ${event_key}Data
//////////////////////////////////////////////////////////////////////////
#pragma pack(push, 1)
struct ${event['name']}Data
{
    // Fields
% for field in fields:
% if field['size'] > 1:
    ${field['type']} ${field['name']}[${field['size']}];
% else:
    ${field['type']} ${field['name']};
% endif
% endfor
};
#pragma pack(pop)

//////////////////////////////////////////////////////////////////////////
/// ${event_key}
//////////////////////////////////////////////////////////////////////////
struct ${event['name']} : Event
{
    const uint32_t eventId = {${ event['id'] }};
    ${event['name']}Data data;

    // Constructor
    // Array fields (size > 1) are passed as a pointer plus a <name>_size
    // argument; scalar fields are passed by value. Only the final parameter
    // omits the trailing comma.
    ${event['name']}(
% for idx, field in enumerate(fields):
% if field['size'] > 1:
        ${field['type']}* ${field['name']},
        uint32_t ${field['name']}_size${',' if idx + 1 < len(fields) else ''}
% else:
        ${field['type']} ${field['name']}${',' if idx + 1 < len(fields) else ''}
% endif
% endfor
    )
    {
% for field in fields:
% if field['size'] > 1:
% if field['type'] == 'char':
        // Copy size of string (null-terminated) followed by string into entire buffer
        SWR_ASSERT(${field['name']}_size + 1 < ${field['size']} - sizeof(uint32_t), "String length must be less than size of char buffer - size(uint32_t)!");
        memcpy(data.${field['name']}, &${field['name']}_size, sizeof(uint32_t));
        strcpy_s(data.${field['name']} + sizeof(uint32_t), ${field['name']}_size + 1, ${field['name']});
% else:
        // memcpy treats ${field['name']}_size as a byte count, so it must fit the fixed-size destination
        SWR_ASSERT(${field['name']}_size <= sizeof(data.${field['name']}), "Copy size must not exceed the size of the destination buffer!");
        memcpy(data.${field['name']}, ${field['name']}, ${field['name']}_size);
% endif
% else:
        data.${field['name']} = ${field['name']};
% endif
% endfor
    }

    void Accept(EventHandler* pHandler) const override;
    inline const uint32_t GetEventId() const override { return eventId; }
% if group not in always_enabled_knob_groups:
<%
    if group in group_knob_remap_table:
        group_knob_define = group_knob_remap_table[group]
    else:
        group_knob_define = 'KNOB_AR_ENABLE_' + group.upper() + '_EVENTS'
%>
    bool IsEnabled() const override
    {
        static const bool IsEventEnabled = true; // TODO: Replace with knob for each event
        return ${group_knob_define} && IsEventEnabled;
    }
% endif
};

% endfor

% endfor
|
||||
} // namespace ArchRast
|
||||
// clang-format on
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}
|
||||
*
|
||||
* @brief Event handler interface. auto-generated file
|
||||
*
|
||||
* DO NOT EDIT
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
#pragma once

#include "${event_header}"

namespace ArchRast
{
    //////////////////////////////////////////////////////////////////////////
    /// EventHandler - interface for handling events.
    ///
    /// Every hook has an empty default body, so a concrete handler only
    /// overrides the Handle() overloads it needs.
    //////////////////////////////////////////////////////////////////////////
    class EventHandler
    {
    public:
        EventHandler() {}
        virtual ~EventHandler() {}

        virtual void FlushDraw(uint32_t drawId) {}

## One Handle() overload per event; groups are visited in sorted order.
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
    event = protos['events']['defs'][event_key]
%>        virtual void Handle(const ${event['name']}& event) {}
% endfor
% endfor
    };
} // namespace ArchRast
// clang-format on
|
||||
|
|
@ -1,174 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}
|
||||
*
|
||||
* @brief Event handler interface. auto-generated file
|
||||
*
|
||||
* DO NOT EDIT
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
#include "${event_header}"
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
|
||||
namespace ArchRast
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// EventHandlerFile - interface for handling events.
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
class EventHandlerFile : public EventHandler
{
public:
    //////////////////////////////////////////////////////////////////////////
    /// @brief Compose the name of the binary event file for this handler.
    ///        Nothing is opened here; the file is opened in append mode by
    ///        FlushBuffer() each time there is data to write.
    /// @param id - value appended to the file name after the id of the
    ///             thread that constructs the handler.
    EventHandlerFile(uint32_t id)
    {
        // There could be multiple threads creating thread pools. We
        // want to make sure they are uniquely identified by adding in
        // the creator's thread id into the filename.
        std::stringstream fstr;

#if defined(_WIN32)
        DWORD pid = GetCurrentProcessId();
        TCHAR procname[MAX_PATH] = {};
        GetModuleFileName(NULL, procname, MAX_PATH);

        // strrchr() returns nullptr when the module path holds no backslash;
        // use the whole name in that case instead of streaming a null pointer.
        const char* pSeparator = strrchr(procname, '\\');
        const char* pBaseName  = (pSeparator != nullptr) ? (pSeparator + 1) : procname;

        // std::ends is deliberately not used: on a stringstream it stores a
        // NUL character inside the resulting std::string.
        std::stringstream outDir;
        outDir << KNOB_DEBUG_OUTPUT_DIR << "\\" << pBaseName << "_" << pid;
        mOutputDir = outDir.str();

        if (CreateDirectory(mOutputDir.c_str(), NULL))
        {
            std::cout << std::endl
                      << "ArchRast Dir: " << mOutputDir << std::endl
                      << std::endl
                      << std::flush;
        }

        fstr << mOutputDir << "\\ar_event";
#else
        fstr << "/tmp/ar_event";
#endif

        fstr << std::this_thread::get_id() << "_" << id << ".bin";
        mFilename = fstr.str();
    }

    virtual ~EventHandlerFile() { FlushBuffer(); }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Flush buffer to file.
    /// @return true when the buffer is empty on return (nothing was pending
    ///         or the pending bytes were appended to the file); false when
    ///         only the header is buffered or the file could not be opened.
    ///         In the false case the buffer contents are left untouched.
    bool FlushBuffer()
    {
        if (mBufOffset == 0)
        {
            return true;
        }

        if (mBufOffset == mHeaderBufOffset)
        {
            // Nothing to flush. Only header has been generated.
            return false;
        }

        std::ofstream file;
        file.open(mFilename, std::ios::out | std::ios::app | std::ios::binary);

        if (!file.is_open())
        {
            SWR_INVALID("ArchRast: Could not open event file!");
            return false;
        }

        file.write(reinterpret_cast<const char*>(mBuffer), mBufOffset);
        file.close();

        mBufOffset       = 0;
        mHeaderBufOffset = 0; // Reset header offset so its no longer considered.
        return true;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Write event and its payload to the memory buffer.
    /// @param eventId - id stored in front of the payload.
    /// @param pBlock  - payload bytes; not read when size is 0.
    /// @param size    - payload size in bytes.
    /// @note The record is dropped when it can never fit in the buffer or
    ///       when the buffer is full and cannot be flushed.
    void Write(uint32_t eventId, const char* pBlock, uint32_t size)
    {
        const size_t recordSize = sizeof(eventId) + static_cast<size_t>(size);

        if (recordSize > mBufferSize)
        {
            // Even an empty buffer cannot hold this record; copying it
            // would run past the end of mBuffer.
            SWR_INVALID("ArchRast: Event is too large for the event buffer!");
            return;
        }

        if ((mBufOffset + recordSize) > mBufferSize)
        {
            if (!FlushBuffer())
            {
                // Don't corrupt what's already in the buffer?
                /// @todo Maybe add corrupt marker to buffer here in case we can open file in
                /// future?
                return;
            }
        }

        memcpy(&mBuffer[mBufOffset], &eventId, sizeof(eventId));
        mBufOffset += sizeof(eventId);

        if (size > 0)
        {
            memcpy(&mBuffer[mBufOffset], pBlock, size);
            mBufOffset += size;
        }
    }
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
    //////////////////////////////////////////////////////////////////////////
    /// @brief Handle ${event_key} event
    virtual void Handle(const ${event['name']}& event)
    {
% if event['num_fields'] == 0:
        Write(event.eventId, reinterpret_cast<const char*>(&event.data), 0);
% else:
        Write(event.eventId, reinterpret_cast<const char*>(&event.data), sizeof(event.data));
% endif
    }
% endfor
% endfor

    //////////////////////////////////////////////////////////////////////////
    /// @brief Everything written to buffer this point is the header.
    virtual void MarkHeader()
    {
        mHeaderBufOffset = mBufOffset;
    }

    std::string mFilename;  ///< Path of the event file appended to by FlushBuffer().
    std::string mOutputDir; ///< Output directory; only assigned in the _WIN32 build.

    static const uint32_t mBufferSize = 1024; ///< Capacity of mBuffer in bytes.
    uint8_t               mBuffer[mBufferSize];
    uint32_t              mBufOffset{0};       ///< Number of bytes currently buffered.
    uint32_t              mHeaderBufOffset{0}; ///< Buffer offset recorded by MarkHeader().
};
|
||||
} // namespace ArchRast
|
||||
// clang-format on
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
//============================================================================
|
||||
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a
|
||||
// copy of this software and associated documentation files (the "Software"),
|
||||
// to deal in the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice (including the next
|
||||
// paragraph) shall be included in all copies or substantial portions of the
|
||||
// Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
// IN THE SOFTWARE.
|
||||
//
|
||||
// @file BackendPixelRate${fileNum}.cpp
|
||||
//
|
||||
// @brief auto-generated file
|
||||
//
|
||||
// DO NOT EDIT
|
||||
//
|
||||
// Generation Command Line:
|
||||
// ${'\n// '.join(cmdline)}
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
#include "core/backend.h"
|
||||
#include "core/backend_impl.h"
|
||||
|
||||
## Template for one generated translation unit: emits a function named
## InitBackendPixelRate<fileNum> whose body is every entry of funcList,
## written out verbatim, one entry per line.
void InitBackendPixelRate${fileNum}()
|
||||
{
|
||||
%for func in funcList:
|
||||
${func}
|
||||
%endfor
|
||||
}
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
//============================================================================
|
||||
// Copyright (C) 2014-2020 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a
|
||||
// copy of this software and associated documentation files (the "Software"),
|
||||
// to deal in the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice (including the next
|
||||
// paragraph) shall be included in all copies or substantial portions of the
|
||||
// Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
// IN THE SOFTWARE.
|
||||
//
|
||||
// @file ${filename}
|
||||
//
|
||||
// @brief auto-generated file
|
||||
//
|
||||
// DO NOT EDIT
|
||||
//
|
||||
// Generation Command Line:
|
||||
// ${'\n// '.join(cmdline)}
|
||||
//
|
||||
//============================================================================
|
||||
// clang-format off
|
||||
#pragma once
|
||||
|
||||
//============================================================================
|
||||
// Auto-generated ${comment}
|
||||
//============================================================================
|
||||
## Emits one builder function per entry of `functions`. Each entry supplies
## 'decl' (the full C++ signature), 'args' (argument names) and, depending on
## the mode, 'returnType', 'name', 'types' and 'intrin'.
%for func in functions:
<%argList = ', '.join(func['args'])%>\
${func['decl']}
{
%if isX86:
## x86 mode: declare (or look up) a "meta.intrinsic.<name>" function in the
## current module and emit a call to it.
%if len(func['args']) != 0:
    SmallVector<Type*, ${len(func['args'])}> argTypes;
%for arg in func['args']:
    argTypes.push_back(${arg}->getType());
%endfor
#if LLVM_VERSION_MAJOR >= 12
#define VEC_GET_NUM_ELEMS cast<FixedVectorType>(a->getType())->getNumElements()
#elif LLVM_VERSION_MAJOR >= 11
#define VEC_GET_NUM_ELEMS cast<VectorType>(a->getType())->getNumElements()
#else
#define VEC_GET_NUM_ELEMS a->getType()->getVectorNumElements()
#endif
    FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false);
%else:
    FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false);
%endif
#if LLVM_VERSION_MAJOR >= 9
    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy).getCallee());
#else
    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy));
#endif
    return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%elif isIntrin:
## LLVM intrinsic mode: only the declaration lookup differs between the
## overloaded and the plain case, so the call itself is emitted once.
%if len(func['types']) != 0:
    SmallVector<Type*, ${len(func['types'])}> args;
%for arg in func['types']:
    args.push_back(${arg}->getType());
%endfor
    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
%else:
    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
%endif
    return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%else:
## Plain mode: forward straight to the IRBuilder method of the same name.
    return IRB()->${func['intrin']}(${argList});
%endif
}

% endfor
|
||||
// clang-format on
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
//============================================================================
|
||||
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a
|
||||
// copy of this software and associated documentation files (the "Software"),
|
||||
// to deal in the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice (including the next
|
||||
// paragraph) shall be included in all copies or substantial portions of the
|
||||
// Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
// IN THE SOFTWARE.
|
||||
//
|
||||
// @file ${filename}
|
||||
//
|
||||
// @brief auto-generated file
|
||||
//
|
||||
// DO NOT EDIT
|
||||
//
|
||||
// Generation Command Line:
|
||||
// ${'\n// '.join(cmdline)}
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
// clang-format off
|
||||
|
||||
## Forward declarations: one Init<tableName><num>() per generated file,
## for num in [0, numFiles).
%for num in range(numFiles):
|
||||
void Init${tableName}${num}();
|
||||
%endfor
|
||||
|
||||
## Aggregate entry point: calls every per-file initializer declared above,
## in ascending num order.
static INLINE void Init${tableName}()
|
||||
{
|
||||
%for num in range(numFiles):
|
||||
Init${tableName}${num}();
|
||||
%endfor
|
||||
}
|
||||
// clang-format on
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
/******************************************************************************
|
||||
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}.cpp
|
||||
*
|
||||
* @brief Dynamic Knobs for Core.
|
||||
*
|
||||
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
|
||||
<% calc_max_knob_len(knobs) %>
|
||||
% for inc in includes:
|
||||
#include <${inc}>
|
||||
% endfor
|
||||
#include <regex>
|
||||
#include <core/utils.h>
|
||||
|
||||
//========================================================
|
||||
// Implementation
|
||||
//========================================================
|
||||
void KnobBase::autoExpandEnvironmentVariables(std::string& text)
|
||||
{
|
||||
size_t start;
|
||||
while ((start = text.find("${'${'}")) != std::string::npos)
|
||||
{
|
||||
size_t end = text.find("}");
|
||||
if (end == std::string::npos)
|
||||
break;
|
||||
const std::string var = GetEnv(text.substr(start + 2, end - start - 2));
|
||||
text.replace(start, end - start + 1, var);
|
||||
}
|
||||
// win32 style variable replacement
|
||||
while ((start = text.find("%")) != std::string::npos)
|
||||
{
|
||||
size_t end = text.find("%", start + 1);
|
||||
if (end == std::string::npos)
|
||||
break;
|
||||
const std::string var = GetEnv(text.substr(start + 1, end - start - 1));
|
||||
text.replace(start, end - start + 1, var);
|
||||
}
|
||||
}
|
||||
|
||||
//========================================================
|
||||
// Static Data Members
|
||||
//========================================================
|
||||
## One out-of-class definition of Knob_<name>::m_default per knob. Each knob is
## read as knob[0] (its name) and knob[1] (a dict with 'type' and 'default').
## String defaults are emitted between double quotes; repr(...)[1:-1] takes
## Python's repr of the default and drops its first and last character
## (presumably the surrounding quotes, keeping escapes intact -- confirm).
% for knob in knobs:
|
||||
% if knob[1]['type'] == 'std::string':
|
||||
${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = "${repr(knob[1]['default'])[1:-1]}";
|
||||
% else:
|
||||
${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = ${knob[1]['default']};
|
||||
% endif
|
||||
% endfor
|
||||
## The single global knob container referenced by GET_KNOB / SET_KNOB.
GlobalKnobs g_GlobalKnobs;
|
||||
|
||||
//========================================================
|
||||
// Knob Initialization
|
||||
//========================================================
|
||||
## Constructor template: emits one InitKnob(<name>) statement per knob, in the
## order of the knobs list. InitKnob is not defined in this template.
GlobalKnobs::GlobalKnobs()
|
||||
{
|
||||
% for knob in knobs :
|
||||
InitKnob(${ knob[0] });
|
||||
% endfor
|
||||
}
|
||||
|
||||
//========================================================
|
||||
// Knob Display (Convert to String)
|
||||
//========================================================
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Render every knob as one "KNOB_<name>: <value>" line.
///        Booleans print as "+" or "-"; float and string knobs print their
///        value as-is; every other type prints in hex (left aligned in an
///        11 character field) followed by decimal.
/// @param optPerLinePrefix - text placed in front of every line; nullptr is
///                           treated as an empty prefix.
/// @return The formatted text, without a trailing NUL character.
std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
{
    std::basic_stringstream<char> str;
    str << std::showbase << std::setprecision(1) << std::fixed;

    if (optPerLinePrefix == nullptr)
    {
        optPerLinePrefix = "";
    }

% for knob in knobs:
    str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
% if knob[1]['type'] == 'bool':
    str << (KNOB_${knob[0]} ? "+\n" : "-\n");
% elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
    str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
    str << std::dec << KNOB_${knob[0]} << "\n";
% else:
    str << KNOB_${knob[0]} << "\n";
% endif
% endfor

    // std::ends is intentionally not streamed: on a stringstream it would
    // store a NUL character as the last element of the returned std::string.
    return str.str();
}
|
||||
<%!
|
||||
# Module-level Python shared by the whole template.
# max_len caches the padded column width computed by calc_max_knob_len().
max_len = 0


def _round_up_to_4(width):
    """Return width rounded up to the next multiple of four."""
    return width + (-width % 4)


def calc_max_knob_len(knobs):
    """Store in the global max_len the column width for 'KNOB_<name> ' entries.

    knobs is a sequence whose items carry the knob name at index 0.
    """
    global max_len
    longest_name = max([0] + [len(entry[0]) for entry in knobs])
    max_len = _round_up_to_4(longest_name + len('KNOB_ '))


def space_knob(knob):
    """Return the spaces that pad 'KNOB_' + knob out to max_len columns."""
    padding = max_len - len('KNOB_' + knob)
    return ' ' * padding


def calc_max_name_len(choices_array):
    """Return the longest choice['name'] length, rounded up to a multiple of 4."""
    longest_name = max([0] + [len(choice['name']) for choice in choices_array])
    return _round_up_to_4(longest_name)


def space_name(name, max_len):
    """Return the spaces that pad name out to max_len columns."""
    return ' ' * (max_len - len(name))
|
||||
%>
|
||||
// clang-format on
|
||||
|
|
@ -1,154 +0,0 @@
|
|||
/******************************************************************************
|
||||
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}.h
|
||||
*
|
||||
* @brief Dynamic Knobs for Core.
|
||||
*
|
||||
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
|
||||
<% calc_max_knob_len(knobs) %>
|
||||
#pragma once
|
||||
#include <string>
|
||||
|
||||
// Shared base of all knobs: provides the value-expansion hook that Knob<T>
// applies whenever a new value is stored.
struct KnobBase
{
private:
    // Rewrites text in place; defined in the generated knobs source file.
    static void autoExpandEnvironmentVariables(std::string& text);

protected:
    // String values: expand a copy and hand it back, the argument is untouched.
    static std::string expandEnvironmentVariables(std::string const& input)
    {
        std::string expanded(input);
        autoExpandEnvironmentVariables(expanded);
        return expanded;
    }

    // Every other value type: returned as an unmodified copy.
    template <typename T>
    static T expandEnvironmentVariables(T const& input)
    {
        return input;
    }
};
|
||||
|
||||
template <typename T>
|
||||
struct Knob : KnobBase
|
||||
{
|
||||
public:
|
||||
const T& Value() const { return m_Value; }
|
||||
const T& Value(T const& newValue)
|
||||
{
|
||||
m_Value = expandEnvironmentVariables(newValue);
|
||||
return Value();
|
||||
}
|
||||
|
||||
private:
|
||||
T m_Value;
|
||||
};
|
||||
|
||||
## DEFINE_KNOB(_name, _type) declares a nested struct Knob_<_name> derived from
## Knob<_type>, adds Name() (returns "KNOB_<_name>"), DefaultValue() (returns the
## private static m_default) and declares a member of that struct called _name.
## NOTE(review): each continuation line ends in a doubled backslash followed by
## an empty line; this looks like it is meant to survive Mako's own
## backslash-newline handling so that the generated header gets a single
## trailing backslash -- confirm before reformatting.
#define DEFINE_KNOB(_name, _type) \\
|
||||
|
||||
struct Knob_##_name : Knob<_type> \\
|
||||
|
||||
{ \\
|
||||
|
||||
static const char* Name() { return "KNOB_" #_name; } \\
|
||||
|
||||
static _type DefaultValue() { return (m_default); } \\
|
||||
|
||||
private: \\
|
||||
|
||||
static _type m_default; \\
|
||||
|
||||
} _name;
|
||||
|
||||
## Read / write access to a knob of the global g_GlobalKnobs instance.
#define GET_KNOB(_name) g_GlobalKnobs._name.Value()
|
||||
#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue)
|
||||
|
||||
## Container with one DEFINE_KNOB member per entry of knobs. In front of each
## member the template emits a comment block: the knob name, every line of
## knob[1]['desc'] and, when knob[1] has 'choices', one "name = value" line per
## choice with the value formatted as '#010x'.
struct GlobalKnobs
|
||||
{
|
||||
% for knob in knobs:
|
||||
//-----------------------------------------------------------
|
||||
// KNOB_${knob[0]}
|
||||
//
|
||||
% for line in knob[1]['desc']:
|
||||
// ${line}
|
||||
% endfor
|
||||
% if knob[1].get('choices'):
|
||||
<%
|
||||
choices = knob[1].get('choices')
|
||||
_max_len = calc_max_name_len(choices) %>//
|
||||
% for i in range(len(choices)):
|
||||
// ${choices[i]['name']}${space_name(choices[i]['name'], _max_len)} = ${format(choices[i]['value'], '#010x')}
|
||||
% endfor
|
||||
% endif
|
||||
//
|
||||
DEFINE_KNOB(${knob[0]}, ${knob[1]['type']});
|
||||
|
||||
% endfor
|
||||
|
||||
## Formats all knobs as text; optPerLinePrefix is put in front of every line.
std::string ToString(const char* optPerLinePrefix="");
|
||||
GlobalKnobs();
|
||||
};
|
||||
extern GlobalKnobs g_GlobalKnobs;
|
||||
|
||||
#undef DEFINE_KNOB
|
||||
|
||||
## One accessor macro per knob: KNOB_<name> expands to GET_KNOB(<name>).
## space_knob() pads the name so the GET_KNOB column lines up.
% for knob in knobs:
|
||||
#define KNOB_${knob[0]}${space_knob(knob[0])} GET_KNOB(${knob[0]})
|
||||
% endfor
|
||||
|
||||
<%!
|
||||
# Module-level Python shared by the whole template.
# max_len caches the padded column width computed by calc_max_knob_len().
max_len = 0


def _round_up_to_4(width):
    """Return width rounded up to the next multiple of four."""
    return width + (-width % 4)


def calc_max_knob_len(knobs):
    """Store in the global max_len the column width for 'KNOB_<name> ' entries.

    knobs is a sequence whose items carry the knob name at index 0.
    """
    global max_len
    longest_name = max([0] + [len(entry[0]) for entry in knobs])
    max_len = _round_up_to_4(longest_name + len('KNOB_ '))


def space_knob(knob):
    """Return the spaces that pad 'KNOB_' + knob out to max_len columns."""
    padding = max_len - len('KNOB_' + knob)
    return ' ' * padding


def calc_max_name_len(choices_array):
    """Return the longest choice['name'] length, rounded up to a multiple of 4."""
    longest_name = max([0] + [len(choice['name']) for choice in choices_array])
    return _round_up_to_4(longest_name)


def space_name(name, max_len):
    """Return the spaces that pad name out to max_len columns."""
    return ' ' * (max_len - len(name))
|
||||
%>
|
||||
// clang-format on
|
||||
|
|
@ -1,109 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ${filename}
|
||||
*
|
||||
* @brief auto-generated file
|
||||
*
|
||||
* DO NOT EDIT
|
||||
*
|
||||
* Generation Command Line:
|
||||
* ${'\n * '.join(cmdline)}
|
||||
*
|
||||
******************************************************************************/
|
||||
// clang-format off
|
||||
|
||||
#include <llvm/IR/DerivedTypes.h>
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace SwrJit
|
||||
{
|
||||
using namespace llvm;
|
||||
|
||||
## For every entry of types this emits:
##   * Gen_<name>(): returns the LLVM StructType registered under <name>,
##     creating it (plus debug metadata) the first time it is not found;
##   * one "<name>_<member>" constant per member holding the member's
##     position in the struct (Mako's loop.index).
%for type in types:
|
||||
INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr)
|
||||
{
|
||||
## The local ctx reference is only emitted when a member type expression
## contains the text "(ctx)" (see needs_ctx).
%if needs_ctx(type):
|
||||
LLVMContext& ctx = pJitMgr->mContext;
|
||||
|
||||
%endif
|
||||
## Look up an already registered struct; the lookup call depends on the
## LLVM major version.
#if LLVM_VERSION_MAJOR >= 12
|
||||
StructType* pRetType = StructType::getTypeByName(pJitMgr->mContext, "${type['name']}");
|
||||
#else
|
||||
StructType* pRetType = pJitMgr->mpCurrentModule->getTypeByName("${type['name']}");
|
||||
#endif
|
||||
if (pRetType == nullptr)
|
||||
{
|
||||
## The inline python on the next line computes max_name_len, which is used
## for column padding here and in the constants emitted after the function.
std::vector<Type*> members =<% (max_type_len, max_name_len) = calc_max_len(type['members']) %>
|
||||
{
|
||||
%for member in type['members']:
|
||||
/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ ${member['type']},
|
||||
%endfor
|
||||
};
|
||||
|
||||
pRetType = StructType::create(members, "${type['name']}", false);
|
||||
|
||||
// Compute debug metadata
|
||||
llvm::DIBuilder builder(*pJitMgr->mpCurrentModule);
|
||||
llvm::DIFile* pFile = builder.createFile("${input_file}", "${os.path.normpath(input_dir).replace('\\', '/')}");
|
||||
|
||||
std::vector<std::pair<std::string, uint32_t>> dbgMembers =
|
||||
{
|
||||
%for member in type['members']:
|
||||
std::make_pair("${member['name']}", ${pad(len(member['name']), max_name_len)}${member['lineNum']}),
|
||||
%endfor
|
||||
};
|
||||
pJitMgr->CreateDebugStructType(pRetType, "${type['name']}", pFile, ${type['lineNum']}, dbgMembers);
|
||||
}
|
||||
|
||||
return pRetType;
|
||||
}
|
||||
|
||||
## Member index constants, numbered in declaration order starting at 0.
%for member in type['members']:
|
||||
static const uint32_t ${type['name']}_${member['name']} ${pad(len(member['name']), max_name_len)}= ${loop.index};
|
||||
%endfor
|
||||
|
||||
%endfor
|
||||
} // namespace SwrJit
|
||||
|
||||
<%! # Global function definitions
|
||||
import os


def needs_ctx(struct_type):
    """Return True when any member's 'type' text contains '(ctx)'.

    Missing 'members' or 'type' keys count as no match.
    """
    return any('(ctx)' in member.get('type', '')
               for member in struct_type.get('members', []))


def calc_max_len(fields):
    """Return (longest 'type' length, longest 'name' length) over fields."""
    widest_type = 0
    widest_name = 0
    for field in fields:
        widest_type = max(widest_type, len(field['type']))
        widest_name = max(widest_name, len(field['name']))
    return (widest_type, widest_name)


def pad(cur_len, max_len):
    """Return the spaces needed to grow cur_len columns to max_len columns."""
    return ' ' * (max_len - cur_len)
|
||||
%>
|
||||
// clang-format on
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
//============================================================================
|
||||
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a
|
||||
// copy of this software and associated documentation files (the "Software"),
|
||||
// to deal in the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice (including the next
|
||||
// paragraph) shall be included in all copies or substantial portions of the
|
||||
// Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
// IN THE SOFTWARE.
|
||||
//
|
||||
// @file gen_rasterizer${fileNum}.cpp
|
||||
//
|
||||
// @brief auto-generated file
|
||||
//
|
||||
// DO NOT EDIT
|
||||
//
|
||||
// Generation Command Line:
|
||||
// ${'\n// '.join(cmdline)}
|
||||
//
|
||||
//============================================================================
|
||||
// clang-format off
|
||||
|
||||
#include "core/rasterizer.h"
|
||||
#include "core/rasterizer_impl.h"
|
||||
|
||||
## Template for one generated translation unit: emits a function named
## InitRasterizerFuncs<fileNum> whose body is every entry of funcList,
## written out verbatim, one entry per line.
void InitRasterizerFuncs${fileNum}()
|
||||
{
|
||||
%for func in funcList:
|
||||
${func}
|
||||
%endfor
|
||||
}
|
||||
// clang-format on
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,268 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file formats.h
|
||||
*
|
||||
* @brief auto-generated file
|
||||
*
|
||||
* DO NOT EDIT
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_TYPE - Format component type
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// Unscoped enum with implicit values: SWR_TYPE_UNKNOWN is 0 and each following
// enumerator is one larger, so reordering entries changes their numeric value.
// NOTE(review): the suffixes presumably follow the usual graphics naming
// (UNORM/SNORM = normalized, UINT/SINT = integer, xSCALED = integer converted
// to float, SFIXED = signed fixed point) -- confirm against the format tables.
enum SWR_TYPE
|
||||
{
|
||||
SWR_TYPE_UNKNOWN,
|
||||
SWR_TYPE_UNUSED,
|
||||
SWR_TYPE_UNORM,
|
||||
SWR_TYPE_SNORM,
|
||||
SWR_TYPE_UINT,
|
||||
SWR_TYPE_SINT,
|
||||
SWR_TYPE_FLOAT,
|
||||
SWR_TYPE_SSCALED,
|
||||
SWR_TYPE_USCALED,
|
||||
SWR_TYPE_SFIXED,
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT
///
/// Surface format identifiers. The numeric values are sparse (not every
/// value below NUM_SWR_FORMATS names a format); NUM_SWR_FORMATS is one past
/// the largest value (RAW) and is used as the size of gFormatInfo.
//////////////////////////////////////////////////////////////////////////
enum SWR_FORMAT
{
    R32G32B32A32_FLOAT = 0x0,
    R32G32B32A32_SINT = 0x1,
    R32G32B32A32_UINT = 0x2,
    R64G64_FLOAT = 0x5,
    R32G32B32X32_FLOAT = 0x6,
    R32G32B32A32_SSCALED = 0x7,
    R32G32B32A32_USCALED = 0x8,
    R32G32B32A32_SFIXED = 0x20,
    R32G32B32_FLOAT = 0x40,
    R32G32B32_SINT = 0x41,
    R32G32B32_UINT = 0x42,
    R32G32B32_SSCALED = 0x45,
    R32G32B32_USCALED = 0x46,
    R32G32B32_SFIXED = 0x50,
    R16G16B16A16_UNORM = 0x80,
    R16G16B16A16_SNORM = 0x81,
    R16G16B16A16_SINT = 0x82,
    R16G16B16A16_UINT = 0x83,
    R16G16B16A16_FLOAT = 0x84,
    R32G32_FLOAT = 0x85,
    R32G32_SINT = 0x86,
    R32G32_UINT = 0x87,
    R32_FLOAT_X8X24_TYPELESS = 0x88,
    X32_TYPELESS_G8X24_UINT = 0x89,
    L32A32_FLOAT = 0x8A,
    R64_FLOAT = 0x8D,
    R16G16B16X16_UNORM = 0x8E,
    R16G16B16X16_FLOAT = 0x8F,
    L32X32_FLOAT = 0x91,
    I32X32_FLOAT = 0x92,
    R16G16B16A16_SSCALED = 0x93,
    R16G16B16A16_USCALED = 0x94,
    R32G32_SSCALED = 0x95,
    R32G32_USCALED = 0x96,
    R32G32_SFIXED = 0xA0,
    B8G8R8A8_UNORM = 0xC0,
    B8G8R8A8_UNORM_SRGB = 0xC1,
    R10G10B10A2_UNORM = 0xC2,
    R10G10B10A2_UNORM_SRGB = 0xC3,
    R10G10B10A2_UINT = 0xC4,
    R8G8B8A8_UNORM = 0xC7,
    R8G8B8A8_UNORM_SRGB = 0xC8,
    R8G8B8A8_SNORM = 0xC9,
    R8G8B8A8_SINT = 0xCA,
    R8G8B8A8_UINT = 0xCB,
    R16G16_UNORM = 0xCC,
    R16G16_SNORM = 0xCD,
    R16G16_SINT = 0xCE,
    R16G16_UINT = 0xCF,
    R16G16_FLOAT = 0xD0,
    B10G10R10A2_UNORM = 0xD1,
    B10G10R10A2_UNORM_SRGB = 0xD2,
    R11G11B10_FLOAT = 0xD3,
    R10G10B10_FLOAT_A2_UNORM = 0xD5,
    R32_SINT = 0xD6,
    R32_UINT = 0xD7,
    R32_FLOAT = 0xD8,
    R24_UNORM_X8_TYPELESS = 0xD9,
    X24_TYPELESS_G8_UINT = 0xDA,
    L32_UNORM = 0xDD,
    L16A16_UNORM = 0xDF,
    I24X8_UNORM = 0xE0,
    L24X8_UNORM = 0xE1,
    I32_FLOAT = 0xE3,
    L32_FLOAT = 0xE4,
    A32_FLOAT = 0xE5,
    B8G8R8X8_UNORM = 0xE9,
    B8G8R8X8_UNORM_SRGB = 0xEA,
    R8G8B8X8_UNORM = 0xEB,
    R8G8B8X8_UNORM_SRGB = 0xEC,
    R9G9B9E5_SHAREDEXP = 0xED,
    B10G10R10X2_UNORM = 0xEE,
    L16A16_FLOAT = 0xF0,
    R10G10B10X2_USCALED = 0xF3,
    R8G8B8A8_SSCALED = 0xF4,
    R8G8B8A8_USCALED = 0xF5,
    R16G16_SSCALED = 0xF6,
    R16G16_USCALED = 0xF7,
    R32_SSCALED = 0xF8,
    R32_USCALED = 0xF9,
    B5G6R5_UNORM = 0x100,
    B5G6R5_UNORM_SRGB = 0x101,
    B5G5R5A1_UNORM = 0x102,
    B5G5R5A1_UNORM_SRGB = 0x103,
    B4G4R4A4_UNORM = 0x104,
    B4G4R4A4_UNORM_SRGB = 0x105,
    R8G8_UNORM = 0x106,
    R8G8_SNORM = 0x107,
    R8G8_SINT = 0x108,
    R8G8_UINT = 0x109,
    R16_UNORM = 0x10A,
    R16_SNORM = 0x10B,
    R16_SINT = 0x10C,
    R16_UINT = 0x10D,
    R16_FLOAT = 0x10E,
    I16_UNORM = 0x111,
    L16_UNORM = 0x112,
    A16_UNORM = 0x113,
    L8A8_UNORM = 0x114,
    I16_FLOAT = 0x115,
    L16_FLOAT = 0x116,
    A16_FLOAT = 0x117,
    L8A8_UNORM_SRGB = 0x118,
    B5G5R5X1_UNORM = 0x11A,
    B5G5R5X1_UNORM_SRGB = 0x11B,
    R8G8_SSCALED = 0x11C,
    R8G8_USCALED = 0x11D,
    R16_SSCALED = 0x11E,
    R16_USCALED = 0x11F,
    A1B5G5R5_UNORM = 0x124,
    A4B4G4R4_UNORM = 0x125,
    L8A8_UINT = 0x126,
    L8A8_SINT = 0x127,
    R8_UNORM = 0x140,
    R8_SNORM = 0x141,
    R8_SINT = 0x142,
    R8_UINT = 0x143,
    A8_UNORM = 0x144,
    I8_UNORM = 0x145,
    L8_UNORM = 0x146,
    R8_SSCALED = 0x149,
    R8_USCALED = 0x14A,
    L8_UNORM_SRGB = 0x14C,
    L8_UINT = 0x152,
    L8_SINT = 0x153,
    I8_UINT = 0x154,
    I8_SINT = 0x155,
    DXT1_RGB_SRGB = 0x180,
    YCRCB_SWAPUVY = 0x183,
    BC1_UNORM = 0x186,
    BC2_UNORM = 0x187,
    BC3_UNORM = 0x188,
    BC4_UNORM = 0x189,
    BC5_UNORM = 0x18A,
    BC1_UNORM_SRGB = 0x18B,
    BC2_UNORM_SRGB = 0x18C,
    BC3_UNORM_SRGB = 0x18D,
    YCRCB_SWAPUV = 0x18F,
    DXT1_RGB = 0x191,
    R8G8B8_UNORM = 0x193,
    R8G8B8_SNORM = 0x194,
    R8G8B8_SSCALED = 0x195,
    R8G8B8_USCALED = 0x196,
    R64G64B64A64_FLOAT = 0x197,
    R64G64B64_FLOAT = 0x198,
    BC4_SNORM = 0x199,
    BC5_SNORM = 0x19A,
    R16G16B16_FLOAT = 0x19B,
    R16G16B16_UNORM = 0x19C,
    R16G16B16_SNORM = 0x19D,
    R16G16B16_SSCALED = 0x19E,
    R16G16B16_USCALED = 0x19F,
    BC6H_SF16 = 0x1A1,
    BC7_UNORM = 0x1A2,
    BC7_UNORM_SRGB = 0x1A3,
    BC6H_UF16 = 0x1A4,
    R8G8B8_UNORM_SRGB = 0x1A8,
    R16G16B16_UINT = 0x1B0,
    R16G16B16_SINT = 0x1B1,
    R32_SFIXED = 0x1B2,
    R10G10B10A2_SNORM = 0x1B3,
    R10G10B10A2_USCALED = 0x1B4,
    R10G10B10A2_SSCALED = 0x1B5,
    R10G10B10A2_SINT = 0x1B6,
    B10G10R10A2_SNORM = 0x1B7,
    B10G10R10A2_USCALED = 0x1B8,
    B10G10R10A2_SSCALED = 0x1B9,
    B10G10R10A2_UINT = 0x1BA,
    B10G10R10A2_SINT = 0x1BB,
    R8G8B8_UINT = 0x1C8,
    R8G8B8_SINT = 0x1C9,
    RAW = 0x1FF,
    NUM_SWR_FORMATS = 0x200,
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_FORMAT_INFO - Format information
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct SWR_FORMAT_INFO
|
||||
{
|
||||
const char* name;
|
||||
SWR_TYPE type[4];
|
||||
uint32_t defaults[4];
|
||||
uint32_t swizzle[4]; ///< swizzle per component
|
||||
uint32_t bpc[4]; ///< bits per component
|
||||
uint32_t bpp; ///< bits per pixel
|
||||
uint32_t Bpp; ///< bytes per pixel
|
||||
uint32_t numComps; ///< number of components
|
||||
bool isSRGB;
|
||||
bool isBC;
|
||||
bool isSubsampled;
|
||||
bool isLuminance;
|
||||
bool isNormalized[4];
|
||||
float toFloat[4];
|
||||
uint32_t bcWidth;
|
||||
uint32_t bcHeight;
|
||||
};
|
||||
|
||||
extern const SWR_FORMAT_INFO gFormatInfo[NUM_SWR_FORMATS];
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Retrieves format info struct for given format.
|
||||
/// @param format - SWR format
|
||||
INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
|
||||
{
|
||||
SWR_ASSERT(format < NUM_SWR_FORMATS, "Invalid Surface Format: %d", format);
|
||||
SWR_ASSERT(gFormatInfo[format].name != nullptr, "Invalid Surface Format: %d", format);
|
||||
return gFormatInfo[format];
|
||||
}
|
||||
|
||||
// lookup table for unorm8 srgb -> float conversion
|
||||
extern const uint32_t srgb8Table[256];
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef __SWR_INTRIN_H__
|
||||
#define __SWR_INTRIN_H__
|
||||
|
||||
#include "os.h"
|
||||
|
||||
#if !defined(SIMD_ARCH)
|
||||
#define SIMD_ARCH KNOB_ARCH
|
||||
#endif
|
||||
|
||||
#include "simdlib_types.hpp"
|
||||
|
||||
// 4-wide aliases (SIMDImpl::SIMD128Impl)
using simd4scalar  = SIMDImpl::SIMD128Impl::Float;
using simd4scalard = SIMDImpl::SIMD128Impl::Double;
using simd4scalari = SIMDImpl::SIMD128Impl::Integer;
using simd4vector  = SIMDImpl::SIMD128Impl::Vec4;
using simd4mask    = SIMDImpl::SIMD128Impl::Mask;

// 8-wide aliases (SIMDImpl::SIMD256Impl)
using simd8scalar  = SIMDImpl::SIMD256Impl::Float;
using simd8scalard = SIMDImpl::SIMD256Impl::Double;
using simd8scalari = SIMDImpl::SIMD256Impl::Integer;
using simd8vector  = SIMDImpl::SIMD256Impl::Vec4;
using simd8mask    = SIMDImpl::SIMD256Impl::Mask;

// 16-wide aliases (SIMDImpl::SIMD512Impl)
using simd16scalar  = SIMDImpl::SIMD512Impl::Float;
using simd16scalard = SIMDImpl::SIMD512Impl::Double;
using simd16scalari = SIMDImpl::SIMD512Impl::Integer;
using simd16vector  = SIMDImpl::SIMD512Impl::Vec4;
using simd16mask    = SIMDImpl::SIMD512Impl::Mask;

// Width-agnostic names; only KNOB_SIMD_WIDTH == 8 is supported here.
#if KNOB_SIMD_WIDTH == 8
using simdscalar  = simd8scalar;
using simdscalard = simd8scalard;
using simdscalari = simd8scalari;
using simdvector  = simd8vector;
using simdmask    = simd8mask;
#else
#error Unsupported vector width
#endif
|
||||
|
||||
/// @brief Parallel bit deposit: scatters the low-order bits of @p a to the
///        positions of the set bits in @p mask, lowest mask bit first.
/// @param a    - source bits (consumed from bit 0 upwards)
/// @param mask - positions to deposit into
/// @return value with the deposited bits; all other bits are 0
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pdep_u32(a, mask);
#else
    UINT result = 0;

    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
    // using bsf instead of funky loop
    unsigned long maskIndex = 0;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask (unsigned shift: index 31 must not
        //    shift into the sign bit of an int)
        const UINT lowest = static_cast<UINT>(1) << maskIndex;

        // 2. populate LSB from src: all ones if bit 0 of a is set, else 0.
        //    Unsigned negation avoids relying on arithmetic right shift.
        const UINT LSB = static_cast<UINT>(0) - (a & static_cast<UINT>(1));

        // 3. copy bit from mask
        result |= LSB & lowest;

        // 4. clear lowest bit
        mask &= ~lowest;

        // 5. prepare for next iteration
        a >>= 1;
    }

    return result;
#endif
}
|
||||
|
||||
/// @brief Parallel bit extract: gathers the bits of @p a selected by the set
///        bits of @p mask and packs them contiguously starting at bit 0.
/// @param a    - source value
/// @param mask - bits of @p a to extract
/// @return the packed bits; all higher bits are 0
INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pext_u32(a, mask);
#else
    UINT          result = 0;
    unsigned long maskIndex;
    uint32_t      currentBit = 0;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask (unsigned shift: index 31 must not
        //    shift into the sign bit of an int)
        const UINT lowest = static_cast<UINT>(1) << maskIndex;

        // 2. copy the selected source bit into the next output position
        const UINT bit = (a & lowest) != 0 ? 1u : 0u;
        result |= bit << currentBit;
        ++currentBit;

        // 3. clear lowest bit
        mask &= ~lowest;
    }
    return result;
#endif
}
|
||||
|
||||
#endif //__SWR_INTRIN_H__
|
||||
|
|
@ -1,231 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <bitset>
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
|
||||
// Clang for Windows does supply an intrin.h with __cpuid intrinsics, however...
|
||||
// It seems to not realize that a write to "b" (ebx) will kill the value in rbx.
|
||||
// This attempts to use the "native" clang / gcc intrinsics instead of the windows
|
||||
// compatible ones.
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <string.h>
|
||||
#if !defined(__cpuid)
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/// Queries CPUID once at construction and exposes the vendor/brand strings
/// and individual feature bits through getters.
///
/// Naming of the cached registers: f_<leaf>_<REG>_ holds register REG of the
/// given CPUID leaf (leaf "81" is extended leaf 0x80000001).
class InstructionSet
{
public:
    InstructionSet() : CPU_Rep() {}

    // getters
    std::string Vendor(void) const { return CPU_Rep.vendor_; }
    std::string Brand(void) const { return CPU_Rep.brand_; }

    // leaf 1, ECX
    bool SSE3(void) const { return CPU_Rep.f_1_ECX_[0]; }
    bool PCLMULQDQ(void) const { return CPU_Rep.f_1_ECX_[1]; }
    bool MONITOR(void) const { return CPU_Rep.f_1_ECX_[3]; }
    bool SSSE3(void) const { return CPU_Rep.f_1_ECX_[9]; }
    bool FMA(void) const { return CPU_Rep.f_1_ECX_[12]; }
    bool CMPXCHG16B(void) const { return CPU_Rep.f_1_ECX_[13]; }
    bool SSE41(void) const { return CPU_Rep.f_1_ECX_[19]; }
    bool SSE42(void) const { return CPU_Rep.f_1_ECX_[20]; }
    bool MOVBE(void) const { return CPU_Rep.f_1_ECX_[22]; }
    bool POPCNT(void) const { return CPU_Rep.f_1_ECX_[23]; }
    bool AES(void) const { return CPU_Rep.f_1_ECX_[25]; }
    bool XSAVE(void) const { return CPU_Rep.f_1_ECX_[26]; }
    bool OSXSAVE(void) const { return CPU_Rep.f_1_ECX_[27]; }
    bool RDRAND(void) const { return CPU_Rep.f_1_ECX_[30]; }

    // leaf 1, EDX
    bool MSR(void) const { return CPU_Rep.f_1_EDX_[5]; }
    bool CX8(void) const { return CPU_Rep.f_1_EDX_[8]; }
    bool SEP(void) const { return CPU_Rep.f_1_EDX_[11]; }
    bool CMOV(void) const { return CPU_Rep.f_1_EDX_[15]; }
    bool CLFSH(void) const { return CPU_Rep.f_1_EDX_[19]; }
    bool MMX(void) const { return CPU_Rep.f_1_EDX_[23]; }
    bool FXSR(void) const { return CPU_Rep.f_1_EDX_[24]; }
    bool SSE(void) const { return CPU_Rep.f_1_EDX_[25]; }
    bool SSE2(void) const { return CPU_Rep.f_1_EDX_[26]; }

    // leaf 7, EBX
    bool FSGSBASE(void) const { return CPU_Rep.f_7_EBX_[0]; }
    bool BMI1(void) const { return CPU_Rep.f_7_EBX_[3]; }
    bool HLE(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; }
    bool BMI2(void) const { return CPU_Rep.f_7_EBX_[8]; }
    bool ERMS(void) const { return CPU_Rep.f_7_EBX_[9]; }
    bool INVPCID(void) const { return CPU_Rep.f_7_EBX_[10]; }
    bool RTM(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; }
    bool RDSEED(void) const { return CPU_Rep.f_7_EBX_[18]; }
    bool ADX(void) const { return CPU_Rep.f_7_EBX_[19]; }
    bool SHA(void) const { return CPU_Rep.f_7_EBX_[29]; }

    // leaf 7, ECX
    bool PREFETCHWT1(void) const { return CPU_Rep.f_7_ECX_[0]; }

    // leaf 0x80000001, ECX (some bits are only reported for one vendor)
    bool LAHF(void) const { return CPU_Rep.f_81_ECX_[0]; }
    bool LZCNT(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; }
    bool ABM(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; }
    bool SSE4a(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; }
    bool XOP(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; }
    bool TBM(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; }

    // leaf 0x80000001, EDX
    bool SYSCALL(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; }
    bool MMXEXT(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; }
    bool RDTSCP(void) const { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; }
    bool _3DNOWEXT(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; }
    bool _3DNOW(void) const { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; }

    // AVX family
    bool AVX(void) const { return CPU_Rep.f_1_ECX_[28]; }
    bool F16C(void) const { return CPU_Rep.f_1_ECX_[29]; }
    bool AVX2(void) const { return CPU_Rep.f_7_EBX_[5]; }
    bool AVX512F(void) const { return CPU_Rep.f_7_EBX_[16]; }
    bool AVX512PF(void) const { return CPU_Rep.f_7_EBX_[26]; }
    bool AVX512ER(void) const { return CPU_Rep.f_7_EBX_[27]; }
    bool AVX512CD(void) const { return CPU_Rep.f_7_EBX_[28]; }

private:
    /// Runs CPUID for every standard and extended leaf and caches the results.
    class InstructionSet_Internal
    {
    public:
        InstructionSet_Internal() :
            nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0},
            f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{}
        {
            // Register order within each entry is EAX, EBX, ECX, EDX.
            std::array<int, 4> cpui{};

            // Calling __cpuid with 0x0 as the function_id argument
            // gets the number of the highest valid function ID.
#if defined(_MSC_VER) && !defined(__clang__)
            __cpuid(cpui.data(), 0);
            nIds_ = cpui[0];
#else
            nIds_ = __get_cpuid_max(0, NULL);
#endif

            for (int i = 0; i <= nIds_; ++i)
            {
#if defined(_MSC_VER) && !defined(__clang__)
                __cpuidex(cpui.data(), i, 0);
#else
                int* data = cpui.data();
                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
                data_.push_back(cpui);
            }

            // Capture vendor string: leaf 0 returns it in EBX, EDX, ECX order.
            // memcpy is used instead of writing ints through a casted char
            // pointer.
            char vendor[0x20];
            memset(vendor, 0, sizeof(vendor));
            memcpy(vendor + 0, &data_[0][1], sizeof(int));
            memcpy(vendor + 4, &data_[0][3], sizeof(int));
            memcpy(vendor + 8, &data_[0][2], sizeof(int));
            vendor_ = vendor;
            if (vendor_ == "GenuineIntel")
            {
                isIntel_ = true;
            }
            else if (vendor_ == "AuthenticAMD")
            {
                isAMD_ = true;
            }

            // load bitset with flags for function 0x00000001
            if (nIds_ >= 1)
            {
                f_1_ECX_ = data_[1][2];
                f_1_EDX_ = data_[1][3];
            }

            // load bitset with flags for function 0x00000007
            if (nIds_ >= 7)
            {
                f_7_EBX_ = data_[7][1];
                f_7_ECX_ = data_[7][2];
            }

            // Calling __cpuid with 0x80000000 as the function_id argument
            // gets the number of the highest valid extended ID.
#if defined(_MSC_VER) && !defined(__clang__)
            __cpuid(cpui.data(), 0x80000000);
            nExIds_ = cpui[0];
#else
            nExIds_ = __get_cpuid_max(0x80000000, NULL);
#endif

            char brand[0x40];
            memset(brand, 0, sizeof(brand));

            for (unsigned i = 0x80000000; i <= nExIds_; ++i)
            {
#if defined(_MSC_VER) && !defined(__clang__)
                __cpuidex(cpui.data(), i, 0);
#else
                int* data = cpui.data();
                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
                extdata_.push_back(cpui);
            }

            // load bitset with flags for function 0x80000001
            if (nExIds_ >= 0x80000001)
            {
                f_81_ECX_ = extdata_[1][2];
                f_81_EDX_ = extdata_[1][3];
            }

            // Interpret CPU brand string if reported: extended leaves 2..4 each
            // contribute 16 bytes; the buffer was zeroed so it stays terminated.
            if (nExIds_ >= 0x80000004)
            {
                memcpy(brand, extdata_[2].data(), sizeof(cpui));
                memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
                memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
                brand_ = brand;
            }
        }

        int                             nIds_;   ///< highest standard leaf
        unsigned                        nExIds_; ///< highest extended leaf
        std::string                     vendor_;
        std::string                     brand_;
        bool                            isIntel_;
        bool                            isAMD_;
        std::bitset<32>                 f_1_ECX_;
        std::bitset<32>                 f_1_EDX_;
        std::bitset<32>                 f_7_EBX_;
        std::bitset<32>                 f_7_ECX_;
        std::bitset<32>                 f_81_ECX_;
        std::bitset<32>                 f_81_EDX_;
        std::vector<std::array<int, 4>> data_;    ///< raw standard leaves
        std::vector<std::array<int, 4>> extdata_; ///< raw extended leaves
    };
    const InstructionSet_Internal CPU_Rep;
};
|
||||
|
|
@ -1,314 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#include "common/os.h"
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <sstream>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <shlobj.h>
|
||||
#endif // Windows
|
||||
|
||||
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
|
||||
#include <pthread.h>
|
||||
#endif // Linux
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
static const DWORD MS_VC_EXCEPTION = 0x406D1388;
|
||||
|
||||
#pragma pack(push, 8)
|
||||
typedef struct tagTHREADNAME_INFO
|
||||
{
|
||||
DWORD dwType; // Must be 0x1000.
|
||||
LPCSTR szName; // Pointer to name (in user addr space).
|
||||
DWORD dwThreadID; // Thread ID (-1=caller thread).
|
||||
DWORD dwFlags; // Reserved for future use, must be zero.
|
||||
} THREADNAME_INFO;
|
||||
#pragma pack(pop)
|
||||
|
||||
void LegacySetThreadName(const char* pThreadName)
|
||||
{
|
||||
THREADNAME_INFO info;
|
||||
info.dwType = 0x1000;
|
||||
info.szName = pThreadName;
|
||||
info.dwThreadID = GetCurrentThreadId();
|
||||
info.dwFlags = 0;
|
||||
|
||||
if (!IsDebuggerPresent())
|
||||
{
|
||||
// No debugger attached to interpret exception, no need to actually do it
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 6320 6322)
|
||||
__try
|
||||
{
|
||||
RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
|
||||
}
|
||||
__except (EXCEPTION_EXECUTE_HANDLER)
|
||||
{
|
||||
}
|
||||
#pragma warning(pop)
|
||||
}
|
||||
#endif // _MSC_VER
|
||||
|
||||
/// Sets the name of the calling thread for debuggers and tools.
///
/// MSVC builds: uses SetThreadDescription when the running Windows exports
/// it, then always also raises the legacy debugger exception.
/// Linux builds: uses pthread_setname_np; names longer than 15 characters are
/// truncated, because longer names are rejected and would not be applied.
///
/// @param pThreadName - NUL-terminated name; a null pointer is ignored
void SWR_API SetCurrentThreadName(const char* pThreadName)
{
    if (!pThreadName)
    {
        return;
    }

#if defined(_MSC_VER)
    // The SetThreadDescription API was brought in version 1607 of Windows 10.
    typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
    // The SetThreadDescription API works even if no debugger is attached.
    auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
        GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));

    if (!pfnSetThreadDescription)
    {
        // try KernelBase.dll
        pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
            GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
    }

    if (pfnSetThreadDescription)
    {
        std::string  utf8Name = pThreadName;
        std::wstring wideName;
        // +1 leaves room for the terminator written by swprintf_s
        wideName.resize(utf8Name.size() + 1);
        swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str());
        HRESULT hr = pfnSetThreadDescription(GetCurrentThread(), wideName.c_str());
        SWR_ASSERT(SUCCEEDED(hr), "Failed to set thread name to %s", pThreadName);

        // Fall through - it seems like some debuggers only recognize the exception
    }

    // Fall back to exception based hack
    LegacySetThreadName(pThreadName);
#endif // _MSC_VER

#if defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
    // Linux limits thread names to 16 bytes including the terminating NUL and
    // fails with ERANGE otherwise, so keep at most the first 15 characters.
    std::string shortName(pThreadName);
    if (shortName.size() > 15)
    {
        shortName.resize(15);
    }
    pthread_setname_np(pthread_self(), shortName.c_str());
#endif // Linux
}
|
||||
|
||||
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
|
||||
/// Splits @p input at every occurrence of @p splitToken.
/// @param out_segments - receives the non-empty segments in order; any
///                       previous contents are discarded
/// @param input        - string to split
/// @param splitToken   - delimiter character; it never appears in the output
static void
SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
{
    out_segments.clear();

    std::string::size_type begin = 0;
    while (begin < input.size())
    {
        std::string::size_type end = input.find(splitToken, begin);
        if (end == std::string::npos)
        {
            end = input.size();
        }

        // Consecutive, leading and trailing delimiters yield empty pieces,
        // which are dropped.
        if (end > begin)
        {
            out_segments.emplace_back(input, begin, end - begin);
        }
        begin = end + 1;
    }
}
|
||||
#endif // Unix
|
||||
|
||||
/// Creates every directory along @p path that does not exist yet.
///
/// Windows: delegates to SHCreateDirectoryExA.
/// Unix: creates each '/'-separated component in turn with mode 0777 and
/// stops at the first failure other than EEXIST. Absolute paths are built
/// from the root; relative paths stay relative to the working directory.
///
/// @param path - directory path to create
void SWR_API CreateDirectoryPath(const std::string& path)
{
#if defined(_WIN32)
    SHCreateDirectoryExA(nullptr, path.c_str(), nullptr);
#endif // Windows

#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
    std::vector<std::string> pathSegments;
    SplitString(pathSegments, path, '/');

    // Only a path that starts with '/' is rooted; prepending '/' unconditionally
    // would turn "a/b" into "/a/b".
    const bool isAbsolute = !path.empty() && path[0] == '/';

    std::string tmpPath;
    for (auto const& segment : pathSegments)
    {
        if (isAbsolute || !tmpPath.empty())
        {
            tmpPath.push_back('/');
        }
        tmpPath += segment;

        int result = mkdir(tmpPath.c_str(), 0777);
        if (result == -1 && errno != EEXIST)
        {
            break;
        }
    }
#endif // Unix
}
|
||||
|
||||
/// Execute Command (block until finished)
|
||||
/// @returns process exit value
|
||||
int SWR_API ExecCmd(const std::string& cmd, ///< (In) Command line string
|
||||
const char* pOptEnvStrings, ///< (Optional In) Environment block for new process
|
||||
std::string* pOptStdOut, ///< (Optional Out) Standard Output text
|
||||
std::string* pOptStdErr, ///< (Optional Out) Standard Error text
|
||||
const std::string* pOptStdIn) ///< (Optional In) Standard Input text
|
||||
{
|
||||
int rvalue = -1;
|
||||
|
||||
#if defined(_WIN32)
|
||||
struct WinPipe
|
||||
{
|
||||
HANDLE hRead;
|
||||
HANDLE hWrite;
|
||||
};
|
||||
std::array<WinPipe, 3> hPipes = {};
|
||||
|
||||
SECURITY_ATTRIBUTES saAttr = {sizeof(SECURITY_ATTRIBUTES)};
|
||||
saAttr.bInheritHandle = TRUE; // Pipe handles are inherited by child process.
|
||||
saAttr.lpSecurityDescriptor = NULL;
|
||||
|
||||
{
|
||||
bool bFail = false;
|
||||
for (WinPipe& p : hPipes)
|
||||
{
|
||||
if (!CreatePipe(&p.hRead, &p.hWrite, &saAttr, 0))
|
||||
{
|
||||
bFail = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (bFail)
|
||||
{
|
||||
for (WinPipe& p : hPipes)
|
||||
{
|
||||
CloseHandle(p.hRead);
|
||||
CloseHandle(p.hWrite);
|
||||
}
|
||||
return rvalue;
|
||||
}
|
||||
}
|
||||
|
||||
STARTUPINFOA StartupInfo{};
|
||||
StartupInfo.cb = sizeof(STARTUPINFOA);
|
||||
StartupInfo.dwFlags = STARTF_USESTDHANDLES;
|
||||
StartupInfo.dwFlags |= STARTF_USESHOWWINDOW;
|
||||
StartupInfo.wShowWindow = SW_HIDE;
|
||||
if (pOptStdIn)
|
||||
{
|
||||
StartupInfo.hStdInput = hPipes[0].hRead;
|
||||
}
|
||||
StartupInfo.hStdOutput = hPipes[1].hWrite;
|
||||
StartupInfo.hStdError = hPipes[2].hWrite;
|
||||
PROCESS_INFORMATION procInfo{};
|
||||
|
||||
// CreateProcess can modify the string
|
||||
std::string local_cmd = cmd;
|
||||
|
||||
BOOL ProcessValue = CreateProcessA(NULL,
|
||||
(LPSTR)local_cmd.c_str(),
|
||||
NULL,
|
||||
NULL,
|
||||
TRUE,
|
||||
0,
|
||||
(LPVOID)pOptEnvStrings,
|
||||
NULL,
|
||||
&StartupInfo,
|
||||
&procInfo);
|
||||
|
||||
if (ProcessValue && procInfo.hProcess)
|
||||
{
|
||||
auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) {
|
||||
char buf[1024];
|
||||
DWORD dwRead = 0;
|
||||
DWORD dwAvail = 0;
|
||||
while (true)
|
||||
{
|
||||
if (!::PeekNamedPipe(hPipe, NULL, 0, NULL, &dwAvail, NULL))
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (!dwAvail) // no data available, return
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (!::ReadFile(hPipe,
|
||||
buf,
|
||||
std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)),
|
||||
&dwRead,
|
||||
NULL) ||
|
||||
!dwRead)
|
||||
{
|
||||
// error, the child process might ended
|
||||
break;
|
||||
}
|
||||
|
||||
buf[dwRead] = 0;
|
||||
if (pOutStr)
|
||||
{
|
||||
(*pOutStr) += buf;
|
||||
}
|
||||
}
|
||||
};
|
||||
bool bProcessEnded = false;
|
||||
size_t bytesWritten = 0;
|
||||
do
|
||||
{
|
||||
if (pOptStdIn && (pOptStdIn->size() > bytesWritten))
|
||||
{
|
||||
DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten;
|
||||
if (!::WriteFile(hPipes[0].hWrite,
|
||||
pOptStdIn->data() + bytesWritten,
|
||||
bytesToWrite,
|
||||
&bytesToWrite,
|
||||
nullptr))
|
||||
{
|
||||
// Failed to write to pipe
|
||||
break;
|
||||
}
|
||||
bytesWritten += bytesToWrite;
|
||||
}
|
||||
|
||||
// Give some timeslice (50ms), so we won't waste 100% cpu.
|
||||
bProcessEnded = (WaitForSingleObject(procInfo.hProcess, 50) == WAIT_OBJECT_0);
|
||||
|
||||
ReadFromPipe(hPipes[1].hRead, pOptStdOut);
|
||||
ReadFromPipe(hPipes[2].hRead, pOptStdErr);
|
||||
} while (!bProcessEnded);
|
||||
|
||||
DWORD exitVal = 0;
|
||||
if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
|
||||
{
|
||||
exitVal = 1;
|
||||
}
|
||||
|
||||
CloseHandle(procInfo.hProcess);
|
||||
CloseHandle(procInfo.hThread);
|
||||
|
||||
rvalue = exitVal;
|
||||
}
|
||||
|
||||
for (WinPipe& p : hPipes)
|
||||
{
|
||||
CloseHandle(p.hRead);
|
||||
CloseHandle(p.hWrite);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Non-Windows implementation
|
||||
|
||||
#endif
|
||||
|
||||
return rvalue;
|
||||
}
|
||||
|
|
@ -1,365 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef __SWR_OS_H__
|
||||
#define __SWR_OS_H__
|
||||
|
||||
#include <cstddef>
|
||||
#include "core/knobs.h"
|
||||
|
||||
#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
|
||||
|
||||
#define SWR_API __cdecl
|
||||
#define SWR_VISIBLE __declspec(dllexport)
|
||||
|
||||
#ifndef NOMINMAX
|
||||
#undef UNICODE
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#undef NOMINMAX
|
||||
#define UNICODE
|
||||
#else
|
||||
#undef UNICODE
|
||||
#include <windows.h>
|
||||
#define UNICODE
|
||||
#endif
|
||||
#include <intrin.h>
|
||||
#include <cstdint>
|
||||
|
||||
#if defined(MemoryFence)
|
||||
// Windows.h defines MemoryFence as _mm_mfence, but this conflicts with llvm::sys::MemoryFence
|
||||
#undef MemoryFence
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
|
||||
#elif defined(__GNUC__)
|
||||
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
|
||||
#endif
|
||||
|
||||
#if defined(_DEBUG)
|
||||
// We compile Debug builds with inline function expansion enabled. This allows
|
||||
// functions compiled with __forceinline to be inlined even in Debug builds.
|
||||
// The inline_depth(0) pragma below will disable inline function expansion for
|
||||
// normal INLINE / inline functions, but not for __forceinline functions.
|
||||
// Our SIMD function wrappers (see simdlib.hpp) use __forceinline even in
|
||||
// Debug builds.
|
||||
#define INLINE inline
|
||||
#pragma inline_depth(0)
|
||||
#else
|
||||
// Use of __forceinline increases compile time dramatically in release builds
|
||||
// and provides almost 0 measurable benefit. Disable until we have a compelling
|
||||
// use-case
|
||||
// #define INLINE __forceinline
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#ifndef FORCEINLINE
|
||||
#define FORCEINLINE __forceinline
|
||||
#endif
|
||||
|
||||
#define DEBUGBREAK __debugbreak()
|
||||
|
||||
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
|
||||
__pragma(warning(push)); \
|
||||
__pragma(warning(disable : __VA_ARGS__));
|
||||
|
||||
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
|
||||
|
||||
/// Windows implementation: forwards directly to the CRT's _aligned_malloc().
/// @param _Size       number of bytes requested
/// @param _Alignment  requested alignment, passed through unchanged
/// @return whatever _aligned_malloc() returns; no extra checking is done here
static inline void* AlignedMalloc(size_t _Size, size_t _Alignment)
|
||||
{
|
||||
return _aligned_malloc(_Size, _Alignment);
|
||||
}
|
||||
|
||||
/// Windows implementation: hands the pointer to the CRT's _aligned_free().
/// @param p  pointer to release, passed through unchanged
static inline void AlignedFree(void* p)
|
||||
{
|
||||
// "return <void expression>" is legal C++ for a void function.
return _aligned_free(p);
|
||||
}
|
||||
|
||||
#if defined(_WIN64)
|
||||
#define BitScanReverseSizeT BitScanReverse64
|
||||
#define BitScanForwardSizeT BitScanForward64
|
||||
#define _mm_popcount_sizeT _mm_popcnt_u64
|
||||
#else
|
||||
#define BitScanReverseSizeT BitScanReverse
|
||||
#define BitScanForwardSizeT BitScanForward
|
||||
#define _mm_popcount_sizeT _mm_popcnt_u32
|
||||
#endif
|
||||
|
||||
#if !defined(_WIN64)
|
||||
extern "C" {
|
||||
/// Stand-in for the MSVC _BitScanForward64 intrinsic on non-_WIN64 targets.
///
/// @param Index  receives the zero-based position of the LOWEST set bit of
///               Mask; left untouched when Mask is zero
/// @param Mask   64-bit value to scan
/// @return 0 when Mask is zero, 1 otherwise
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
    if (Mask == 0)
        return 0;
#ifdef __GNUC__
    *Index = __builtin_ctzll(Mask);
#else
    // Scan upwards from bit 0 and stop at the first hit. Without the break
    // the loop keeps overwriting Index and finishes on the highest set bit.
    *Index = 0;
    for (int i = 0; i < 64; ++i)
    {
        if ((1ULL << i) & Mask)
        {
            *Index = i;
            break;
        }
    }
#endif
    return 1;
}
|
||||
|
||||
/// Stand-in for the MSVC _BitScanReverse64 intrinsic on non-_WIN64 targets.
///
/// @param Index  receives the zero-based position of the HIGHEST set bit of
///               Mask; left untouched when Mask is zero
/// @param Mask   64-bit value to scan
/// @return 0 when Mask is zero, 1 otherwise
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
    if (Mask == 0)
        return 0;
#ifdef __GNUC__
    *Index = 63 - __builtin_clzll(Mask);
#else
    // Scan downwards from bit 63 and stop at the first hit. Without the break
    // the loop keeps overwriting Index and finishes on the lowest set bit.
    *Index = 0;
    for (int i = 63; i >= 0; --i)
    {
        if ((1ULL << i) & Mask)
        {
            *Index = i;
            break;
        }
    }
#endif
    return 1;
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
|
||||
|
||||
#define SWR_API
|
||||
#define SWR_VISIBLE __attribute__((visibility("default")))
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <x86intrin.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
|
||||
typedef void VOID;
|
||||
typedef void* LPVOID;
|
||||
typedef int INT;
|
||||
typedef unsigned int UINT;
|
||||
typedef void* HANDLE;
|
||||
typedef int LONG;
|
||||
typedef unsigned int DWORD;
|
||||
|
||||
#undef FALSE
|
||||
#define FALSE 0
|
||||
|
||||
#undef TRUE
|
||||
#define TRUE 1
|
||||
|
||||
#define MAX_PATH PATH_MAX
|
||||
|
||||
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
|
||||
#ifndef INLINE
|
||||
#define INLINE __inline
|
||||
#endif
|
||||
#ifndef FORCEINLINE
|
||||
#define FORCEINLINE INLINE
|
||||
#endif
|
||||
#define DEBUGBREAK asm("int $3")
|
||||
|
||||
#if !defined(__CYGWIN__)
|
||||
|
||||
#ifndef __cdecl
|
||||
#define __cdecl
|
||||
#endif
|
||||
#ifndef __stdcall
|
||||
#define __stdcall
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#define __declspec(x) __declspec_##x
|
||||
#define __declspec_align(y) __attribute__((aligned(y)))
|
||||
#define __declspec_deprecated __attribute__((deprecated))
|
||||
#define __declspec_dllexport
|
||||
#define __declspec_dllimport
|
||||
#define __declspec_noinline __attribute__((__noinline__))
|
||||
#define __declspec_nothrow __attribute__((nothrow))
|
||||
#define __declspec_novtable
|
||||
#define __declspec_thread __thread
|
||||
#else
|
||||
#define __declspec(X)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
|
||||
#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
|
||||
/// Fallback __rdtsc() for old GCC releases that lack the intrinsic.
/// Executes the x86 RDTSC instruction and combines EDX:EAX into 64 bits.
/// @return the 64-bit time-stamp counter value
inline uint64_t __rdtsc()
{
    // RDTSC delivers two 32-bit halves. They must be held in unsigned 32-bit
    // variables: with a signed 32-bit 'long' (ILP32 targets) the low half
    // would be sign-extended when widened to uint64_t, setting all upper
    // bits whenever bit 31 of the low half is 1.
    uint32_t low  = 0;
    uint32_t high = 0;
    asm volatile("rdtsc" : "=a"(low), "=d"(high));
    return (uint64_t)low | ((uint64_t)high << 32);
}
|
||||
#endif
|
||||
|
||||
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
|
||||
// Intrinsic not defined in gcc < 10
|
||||
#if (__GNUC__) && (GCC_VERSION < 100000)
|
||||
// Splits a 256-bit integer vector into its two 128-bit lanes and writes each
// lane with an unaligned store: lower lane to 'lo', upper lane to 'hi'.
static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
{
    const __m128i lowerLane = _mm256_castsi256_si128(a);
    _mm_storeu_si128(lo, lowerLane);

    const __m128i upperLane = _mm256_extractf128_si256(a, 0x1);
    _mm_storeu_si128(hi, upperLane);
}
|
||||
#endif
|
||||
|
||||
// gcc prior to 4.9 doesn't have _mm*_undefined_*
|
||||
#if (__GNUC__) && (GCC_VERSION < 40900)
|
||||
#define _mm_undefined_si128 _mm_setzero_si128
|
||||
#define _mm256_undefined_ps _mm256_setzero_ps
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Emulation of the MSVC _BitScanForward64 intrinsic: reports the position of
// the lowest set bit of a 64-bit mask. Index is only written when the mask
// is non-zero; the return value is 1 in that case and 0 otherwise.
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
    const bool anyBitSet = (Mask != 0);
    if (anyBitSet)
    {
        *Index = __builtin_ctzll(Mask);
    }
    return anyBitSet ? 1 : 0;
}
|
||||
|
||||
// Emulation of the MSVC _BitScanForward intrinsic: reports the position of
// the lowest set bit of a 32-bit mask. Index is only written when the mask
// is non-zero; the return value is 1 in that case and 0 otherwise.
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
{
    if (Mask != 0)
    {
        *Index = __builtin_ctz(Mask);
        return 1;
    }
    return 0;
}
|
||||
|
||||
// Emulation of the MSVC _BitScanReverse64 intrinsic: reports the position of
// the highest set bit of a 64-bit mask. Index is only written when the mask
// is non-zero; the return value is 1 in that case and 0 otherwise.
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
    const bool anyBitSet = (Mask != 0);
    if (anyBitSet)
    {
        const int leadingZeros = __builtin_clzll(Mask);
        *Index                 = 63 - leadingZeros;
    }
    return anyBitSet ? 1 : 0;
}
|
||||
|
||||
// Emulation of the MSVC _BitScanReverse intrinsic: reports the position of
// the highest set bit of a 32-bit mask. Index is only written when the mask
// is non-zero; the return value is 1 in that case and 0 otherwise.
inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
{
    if (Mask != 0)
    {
        const int leadingZeros = __builtin_clz(Mask);
        *Index                 = 31 - leadingZeros;
        return 1;
    }
    return 0;
}
|
||||
|
||||
/// POSIX implementation of AlignedMalloc, built on posix_memalign().
///
/// @param size       number of bytes requested
/// @param alignment  requested alignment in bytes
/// @return pointer to the allocation, or nullptr when posix_memalign() fails
inline void* AlignedMalloc(size_t size, size_t alignment)
{
    // posix_memalign() only accepts power-of-two multiples of sizeof(void*).
    // A smaller power of two (e.g. 4 on a 64-bit target) would be rejected
    // with EINVAL; raising it to sizeof(void*) still satisfies the caller's
    // request because the larger alignment is a multiple of the smaller one.
    const bool isPowerOfTwo = (alignment != 0) && ((alignment & (alignment - 1)) == 0);
    if (isPowerOfTwo && alignment < sizeof(void*))
    {
        alignment = sizeof(void*);
    }

    void* ret = nullptr;
    if (posix_memalign(&ret, alignment, size) != 0)
    {
        return nullptr;
    }
    return ret;
}
|
||||
|
||||
/// POSIX implementation: releases the pointer with plain free().
/// @param p  pointer to release, passed through unchanged
static inline void AlignedFree(void* p)
|
||||
{
|
||||
free(p);
|
||||
}
|
||||
|
||||
#define _countof(a) (sizeof(a) / sizeof(*(a)))
|
||||
|
||||
#define sprintf_s sprintf
|
||||
#define strcpy_s(dst, size, src) strncpy(dst, src, size)
|
||||
#define GetCurrentProcessId getpid
|
||||
|
||||
#define InterlockedCompareExchange(Dest, Exchange, Comparand) \
|
||||
__sync_val_compare_and_swap(Dest, Comparand, Exchange)
|
||||
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
|
||||
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
|
||||
#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
|
||||
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
|
||||
#define InterlockedAdd(Addend, Value) __sync_add_and_fetch(Addend, Value)
|
||||
#define InterlockedAdd64(Addend, Value) __sync_add_and_fetch(Addend, Value)
|
||||
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
|
||||
|
||||
#define PRAGMA_WARNING_PUSH_DISABLE(...)
|
||||
#define PRAGMA_WARNING_POP()
|
||||
|
||||
#define ZeroMemory(dst, size) memset(dst, 0, size)
|
||||
#else
|
||||
|
||||
#error Unsupported OS/system.
|
||||
|
||||
#endif
|
||||
|
||||
#define THREAD thread_local
|
||||
|
||||
// Universal types
|
||||
typedef uint8_t KILOBYTE[1024];
|
||||
typedef KILOBYTE MEGABYTE[1024];
|
||||
typedef MEGABYTE GIGABYTE[1024];
|
||||
|
||||
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
|
||||
#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
|
||||
#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES)
|
||||
|
||||
#include "common/swr_assert.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define ATTR_UNUSED __attribute__((unused))
|
||||
#else
|
||||
#define ATTR_UNUSED
|
||||
#endif
|
||||
|
||||
#define SWR_FUNC(_retType, _funcName, /* args */...) \
|
||||
typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \
|
||||
_retType SWR_API _funcName(__VA_ARGS__);
|
||||
|
||||
// Defined in os.cpp
|
||||
void SWR_API SetCurrentThreadName(const char* pThreadName);
|
||||
void SWR_API CreateDirectoryPath(const std::string& path);
|
||||
|
||||
/// Execute Command (block until finished)
|
||||
/// @returns process exit value
|
||||
int SWR_API
|
||||
ExecCmd(const std::string& cmd, ///< (In) Command line string
|
||||
const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
|
||||
std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text
|
||||
std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text
|
||||
const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
|
||||
|
||||
|
||||
/// Helper for setting up FP state
|
||||
/// @returns old csr state
|
||||
static INLINE uint32_t SetOptimalVectorCSR()
{
    const uint32_t previous = _mm_getcsr();

    // Fields that get replaced: rounding mode, denormals-are-zero, flush-to-zero.
    const uint32_t replacedBits =
        _MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK;

    // New values for those fields: round-to-nearest with FTZ and DAZ enabled.
    const uint32_t wantedBits = _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON;

    _mm_setcsr((previous & ~replacedBits) | wantedBits);

    // Hand back the untouched original so the caller can restore it later.
    return previous;
}
|
||||
|
||||
/// Set Vector CSR state.
|
||||
/// @param csrState - should be value returned from SetOptimalVectorCSR()
|
||||
// Writes csrState to the control/status register via _mm_setcsr() as-is;
// no masking or validation of the value is performed here.
static INLINE void RestoreVectorCSR(uint32_t csrState)
|
||||
{
|
||||
_mm_setcsr(csrState);
|
||||
}
|
||||
|
||||
#endif //__SWR_OS_H__
|
||||
|
|
@ -1,192 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file rdtsc_buckets.cpp
|
||||
*
|
||||
* @brief implementation of rdtsc buckets.
|
||||
*
|
||||
* Notes:
|
||||
*
|
||||
******************************************************************************/
|
||||
#include "rdtsc_buckets.h"
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define PATH_SEPARATOR "\\"
|
||||
#elif defined(__unix__) || defined(__APPLE__)
|
||||
#define PATH_SEPARATOR "/"
|
||||
#else
|
||||
#error "Unsupported platform"
|
||||
#endif
|
||||
|
||||
THREAD UINT tlsThreadId = 0;
|
||||
|
||||
// Out-of-line destructor with an empty body.
BucketManager::~BucketManager()
|
||||
{
|
||||
}
|
||||
|
||||
void BucketManager::RegisterThread(const std::string& name)
|
||||
{
|
||||
|
||||
BUCKET_THREAD newThread;
|
||||
newThread.name = name;
|
||||
newThread.root.children.reserve(mBuckets.size());
|
||||
newThread.root.id = 0;
|
||||
newThread.root.pParent = nullptr;
|
||||
newThread.pCurrent = &newThread.root;
|
||||
|
||||
mThreadMutex.lock();
|
||||
|
||||
// assign unique thread id for this thread
|
||||
size_t id = mThreads.size();
|
||||
newThread.id = (UINT)id;
|
||||
tlsThreadId = (UINT)id;
|
||||
|
||||
// store new thread
|
||||
mThreads.push_back(newThread);
|
||||
|
||||
mThreadMutex.unlock();
|
||||
}
|
||||
|
||||
// Appends a bucket description to mBuckets.
// @param desc - description to store (copied)
// @return index of the new entry in mBuckets, used as the bucket id
UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
{
    // Scoped lock instead of manual lock()/unlock(): if push_back throws
    // the mutex is still released.
    std::lock_guard<std::mutex> lock(mThreadMutex);

    const size_t id = mBuckets.size();
    mBuckets.push_back(desc);
    return (UINT)id;
}
|
||||
|
||||
void BucketManager::PrintBucket(
|
||||
FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
|
||||
{
|
||||
const char* arrows[] = {
|
||||
"",
|
||||
"|-> ",
|
||||
" |-> ",
|
||||
" |-> ",
|
||||
" |-> ",
|
||||
" |-> ",
|
||||
" |-> ",
|
||||
" |-> ",
|
||||
" |-> ",
|
||||
};
|
||||
|
||||
// compute percent of total cycles used by this bucket
|
||||
float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0);
|
||||
|
||||
// compute percent of parent cycles used by this bucket
|
||||
float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
|
||||
|
||||
// compute average cycle count per invocation
|
||||
uint64_t CPE = bucket.elapsed / bucket.count;
|
||||
|
||||
BUCKET_DESC& desc = mBuckets[bucket.id];
|
||||
|
||||
// construct hierarchy visualization
|
||||
std::string str = arrows[level];
|
||||
str += desc.name;
|
||||
char hier[80];
|
||||
strcpy_s(hier, sizeof(hier)-1, str.c_str());
|
||||
|
||||
// print out
|
||||
fprintf(f,
|
||||
"%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
|
||||
percentTotal,
|
||||
percentParent,
|
||||
bucket.elapsed,
|
||||
CPE,
|
||||
bucket.count,
|
||||
(unsigned long)0,
|
||||
(uint32_t)0,
|
||||
hier);
|
||||
|
||||
// dump all children of this bucket
|
||||
for (const BUCKET& child : bucket.children)
|
||||
{
|
||||
if (child.count)
|
||||
{
|
||||
PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
|
||||
{
|
||||
// print header
|
||||
fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str());
|
||||
fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n");
|
||||
|
||||
// compute thread level total cycle counts across all buckets from root
|
||||
const BUCKET& root = thread.root;
|
||||
uint64_t totalCycles = 0;
|
||||
for (const BUCKET& child : root.children)
|
||||
{
|
||||
totalCycles += child.elapsed;
|
||||
}
|
||||
|
||||
for (const BUCKET& child : root.children)
|
||||
{
|
||||
if (child.count)
|
||||
{
|
||||
PrintBucket(f, 0, totalCycles, totalCycles, child);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BucketManager::PrintReport(const std::string& filename)
|
||||
{
|
||||
{
|
||||
FILE* f = fopen(filename.c_str(), "w");
|
||||
assert(f);
|
||||
|
||||
mThreadMutex.lock();
|
||||
for (const BUCKET_THREAD& thread : mThreads)
|
||||
{
|
||||
PrintThread(f, thread);
|
||||
fprintf(f, "\n");
|
||||
}
|
||||
|
||||
mThreadMutex.unlock();
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Prints a status line to stdout and sets mCapturing, the flag that
// StartBucket() and AddEvent() test before recording anything.
void BucketManager::StartCapture()
|
||||
{
|
||||
|
||||
printf("Capture Starting\n");
|
||||
|
||||
mCapturing = true;
|
||||
}
|
||||
|
||||
// Free-function wrapper that forwards to pBucketMgr->StartBucket(id).
// pBucketMgr is dereferenced without a null check.
void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
|
||||
{
|
||||
pBucketMgr->StartBucket(id);
|
||||
}
|
||||
|
||||
// Free-function wrapper that forwards to pBucketMgr->StopBucket(id).
// pBucketMgr is dereferenced without a null check.
void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
|
||||
{
|
||||
pBucketMgr->StopBucket(id);
|
||||
}
|
||||
|
|
@ -1,227 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file rdtsc_buckets.h
|
||||
*
|
||||
* @brief declaration for rdtsc buckets.
|
||||
*
|
||||
* Notes:
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "os.h"
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
|
||||
#include "rdtsc_buckets_shared.h"
|
||||
|
||||
|
||||
// unique thread id stored in thread local storage
|
||||
extern THREAD UINT tlsThreadId;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief BucketManager encapsulates a single instance of the buckets
|
||||
/// functionality. There can be one or many bucket managers active
|
||||
/// at any time. The manager owns all the threads and
|
||||
/// bucket information that have been registered to it.
|
||||
class BucketManager
|
||||
{
|
||||
public:
|
||||
|
||||
uint32_t mCurrentFrame;
|
||||
std::vector<uint32_t> mBucketMap;
|
||||
bool mBucketsInitialized;
|
||||
std::string mBucketMgrName;
|
||||
|
||||
|
||||
BucketManager(std::string name) : mCurrentFrame(0), mBucketsInitialized(false), mBucketMgrName(name)
|
||||
{
|
||||
mBucketMap.clear();
|
||||
}
|
||||
~BucketManager();
|
||||
|
||||
// removes all registered thread data
|
||||
void ClearThreads()
|
||||
{
|
||||
mThreadMutex.lock();
|
||||
mThreads.clear();
|
||||
mThreadMutex.unlock();
|
||||
}
|
||||
|
||||
// removes all registered buckets
|
||||
void ClearBuckets()
|
||||
{
|
||||
mThreadMutex.lock();
|
||||
mBuckets.clear();
|
||||
mThreadMutex.unlock();
|
||||
}
|
||||
|
||||
/// Registers a new thread with the manager.
|
||||
/// @param name - name of thread, used for labels in reports and threadviz
|
||||
void RegisterThread(const std::string& name);
|
||||
|
||||
/// Registers a new bucket type with the manager. Returns a unique
|
||||
/// id which should be used in subsequent calls to start/stop the bucket
|
||||
/// @param desc - description of the bucket
|
||||
/// @return unique id
|
||||
UINT RegisterBucket(const BUCKET_DESC& desc);
|
||||
|
||||
// print report
|
||||
void PrintReport(const std::string& filename);
|
||||
|
||||
|
||||
// start capturing
|
||||
void StartCapture();
|
||||
|
||||
// stop capturing
|
||||
INLINE void StopCapture()
|
||||
{
|
||||
mCapturing = false;
|
||||
|
||||
// wait for all threads to pop back to root bucket
|
||||
bool stillCapturing = true;
|
||||
while (stillCapturing)
|
||||
{
|
||||
stillCapturing = false;
|
||||
for (const BUCKET_THREAD& t : mThreads)
|
||||
{
|
||||
if (t.level > 0)
|
||||
{
|
||||
stillCapturing = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mDoneCapturing = true;
|
||||
printf("Capture Stopped\n");
|
||||
}
|
||||
|
||||
// start a bucket
|
||||
// @param id generated by RegisterBucket
|
||||
INLINE void StartBucket(UINT id)
|
||||
{
|
||||
if (!mCapturing)
|
||||
return;
|
||||
|
||||
SWR_ASSERT(tlsThreadId < mThreads.size());
|
||||
|
||||
BUCKET_THREAD& bt = mThreads[tlsThreadId];
|
||||
|
||||
uint64_t tsc = __rdtsc();
|
||||
|
||||
{
|
||||
if (bt.pCurrent->children.size() < mBuckets.size())
|
||||
{
|
||||
bt.pCurrent->children.resize(mBuckets.size());
|
||||
}
|
||||
BUCKET& child = bt.pCurrent->children[id];
|
||||
child.pParent = bt.pCurrent;
|
||||
child.id = id;
|
||||
child.start = tsc;
|
||||
|
||||
// update thread's currently executing bucket
|
||||
bt.pCurrent = &child;
|
||||
}
|
||||
|
||||
|
||||
bt.level++;
|
||||
}
|
||||
|
||||
// stop the currently executing bucket
|
||||
INLINE void StopBucket(UINT id)
|
||||
{
|
||||
SWR_ASSERT(tlsThreadId < mThreads.size());
|
||||
BUCKET_THREAD& bt = mThreads[tlsThreadId];
|
||||
|
||||
if (bt.level == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t tsc = __rdtsc();
|
||||
|
||||
{
|
||||
if (bt.pCurrent->start == 0)
|
||||
return;
|
||||
SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
|
||||
|
||||
bt.pCurrent->elapsed += (tsc - bt.pCurrent->start);
|
||||
bt.pCurrent->count++;
|
||||
|
||||
// pop to parent
|
||||
bt.pCurrent = bt.pCurrent->pParent;
|
||||
}
|
||||
|
||||
bt.level--;
|
||||
}
|
||||
|
||||
INLINE void AddEvent(uint32_t id, uint32_t count)
|
||||
{
|
||||
if (!mCapturing)
|
||||
return;
|
||||
|
||||
SWR_ASSERT(tlsThreadId < mThreads.size());
|
||||
|
||||
BUCKET_THREAD& bt = mThreads[tlsThreadId];
|
||||
|
||||
// don't record events for threadviz
|
||||
{
|
||||
if (bt.pCurrent->children.size() < mBuckets.size())
|
||||
{
|
||||
bt.pCurrent->children.resize(mBuckets.size());
|
||||
}
|
||||
BUCKET& child = bt.pCurrent->children[id];
|
||||
child.pParent = bt.pCurrent;
|
||||
child.id = id;
|
||||
child.count += count;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void PrintBucket(
|
||||
FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
|
||||
void PrintThread(FILE* f, const BUCKET_THREAD& thread);
|
||||
|
||||
// list of active threads that have registered with this manager
|
||||
std::vector<BUCKET_THREAD> mThreads;
|
||||
|
||||
// list of buckets registered with this manager
|
||||
std::vector<BUCKET_DESC> mBuckets;
|
||||
|
||||
// is capturing currently enabled
|
||||
volatile bool mCapturing{false};
|
||||
|
||||
// has capturing completed
|
||||
volatile bool mDoneCapturing{false};
|
||||
|
||||
std::mutex mThreadMutex;
|
||||
|
||||
std::string mThreadVizDir;
|
||||
|
||||
};
|
||||
|
||||
// C helpers for jitter
|
||||
void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
|
||||
void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
|
||||
|
|
@ -1,169 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file rdtsc_buckets.h
|
||||
*
|
||||
* @brief declaration for rdtsc buckets.
|
||||
*
|
||||
* Notes:
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
// One node in a thread's bucket tree. BucketManager::StartBucket/StopBucket/
// AddEvent fill these fields in; PrintBucket reads them for the report.
struct BUCKET
|
||||
{
|
||||
// bucket id; StartBucket/AddEvent set it to the index used in children[]
uint32_t id{0};
|
||||
// timestamp-counter value written by StartBucket
uint64_t start{0};
|
||||
// cycles accumulated by StopBucket (tsc - start)
uint64_t elapsed{0};
|
||||
// incremented by StopBucket, increased by AddEvent
uint32_t count{0};
|
||||
|
||||
// enclosing bucket; StopBucket pops back to it
BUCKET* pParent{nullptr};
|
||||
// nested buckets, indexed by bucket id
std::vector<BUCKET> children;
|
||||
};
|
||||
|
||||
// Static description of a bucket type.
// NOTE(review): enableThreadViz and color have no default initializer, so
// they hold indeterminate values unless the creator sets them.
struct BUCKET_DESC
|
||||
{
|
||||
// name of bucket, used in reports
|
||||
std::string name;
|
||||
|
||||
// description of bucket, used in threadviz
|
||||
std::string description;
|
||||
|
||||
// enable for threadviz dumping
|
||||
bool enableThreadViz;
|
||||
|
||||
// threadviz color of bucket, in RGBA8_UNORM format
|
||||
uint32_t color;
|
||||
};
|
||||
|
||||
|
||||
struct BUCKET_THREAD
|
||||
{
|
||||
// name of thread, used in reports
|
||||
std::string name;
|
||||
|
||||
// id for this thread, assigned by the thread manager
|
||||
uint32_t id{0};
|
||||
|
||||
// root of the bucket hierarchy for this thread
|
||||
BUCKET root;
|
||||
|
||||
// currently executing bucket somewhere in the hierarchy
|
||||
BUCKET* pCurrent{nullptr};
|
||||
|
||||
// currently executing hierarchy level
|
||||
uint32_t level{0};
|
||||
|
||||
// threadviz file object
|
||||
FILE* vizFile{nullptr};
|
||||
|
||||
|
||||
BUCKET_THREAD() {}
|
||||
BUCKET_THREAD(const BUCKET_THREAD& that)
|
||||
{
|
||||
name = that.name;
|
||||
id = that.id;
|
||||
root = that.root;
|
||||
pCurrent = &root;
|
||||
vizFile = that.vizFile;
|
||||
}
|
||||
};
|
||||
|
||||
// Record type tags. The Deserialize() overloads for VIZ_START_DATA and
// VIZ_STOP_DATA assert on VIZ_START and VIZ_STOP respectively.
enum VIZ_TYPE
|
||||
{
|
||||
VIZ_START = 0,
|
||||
VIZ_STOP = 1,
|
||||
VIZ_DATA = 2
|
||||
};
|
||||
|
||||
// "Start" record. Serialize()/Deserialize() move it with a single
// fwrite/fread of sizeof(VIZ_START_DATA), i.e. the in-memory layout
// (including any compiler padding) is the on-disk layout.
struct VIZ_START_DATA
|
||||
{
|
||||
// record tag; Deserialize() asserts it equals VIZ_START
uint8_t type;
|
||||
uint32_t bucketId;
|
||||
uint64_t timestamp;
|
||||
};
|
||||
|
||||
// "Stop" record. Serialize()/Deserialize() move it with a single
// fwrite/fread of sizeof(VIZ_STOP_DATA), i.e. the in-memory layout
// (including any compiler padding) is the on-disk layout.
struct VIZ_STOP_DATA
|
||||
{
|
||||
// record tag; Deserialize() asserts it equals VIZ_STOP
uint8_t type;
|
||||
uint64_t timestamp;
|
||||
};
|
||||
|
||||
// Writes the raw bytes of 'data' to 'f' as one sizeof(VIZ_START_DATA) item.
// The fwrite result is not checked.
inline void Serialize(FILE* f, const VIZ_START_DATA& data)
|
||||
{
|
||||
fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
|
||||
}
|
||||
|
||||
// Reads one sizeof(VIZ_START_DATA) item from 'f' straight into 'data'.
// The fread result is not checked; the only validation is the assert on the
// type tag.
inline void Deserialize(FILE* f, VIZ_START_DATA& data)
|
||||
{
|
||||
fread(&data, sizeof(VIZ_START_DATA), 1, f);
|
||||
assert(data.type == VIZ_START);
|
||||
}
|
||||
|
||||
// Writes the raw bytes of 'data' to 'f' as one sizeof(VIZ_STOP_DATA) item.
// The fwrite result is not checked.
inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
|
||||
{
|
||||
fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
|
||||
}
|
||||
|
||||
// Reads one sizeof(VIZ_STOP_DATA) item from 'f' straight into 'data'.
// The fread result is not checked; the only validation is the assert on the
// type tag.
inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
|
||||
{
|
||||
fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
|
||||
assert(data.type == VIZ_STOP);
|
||||
}
|
||||
|
||||
// Writes 'string' to 'f' as a length-prefixed record: one uint8_t length
// byte followed by exactly that many characters (no terminator).
//
// The length byte can represent at most 255 characters. Longer input trips
// the assert in debug builds; in builds without asserts it is truncated to
// 255 characters so the length byte and the payload always agree.
inline void Serialize(FILE* f, const std::string& string)
{
    const size_t maxLength = 255; // largest value a uint8_t length can hold
    assert(string.size() <= maxLength);

    const uint8_t length = (uint8_t)(string.size() < maxLength ? string.size() : maxLength);
    fwrite(&length, sizeof(length), 1, f);
    if (length != 0)
    {
        // write only 'length' bytes so the payload matches the prefix
        fwrite(string.data(), 1, length, f);
    }
}
|
||||
|
||||
// Reads a length-prefixed string (one uint8_t length byte followed by that
// many characters) from 'f' into 'string'.
//
// 'string' is left empty when the length byte cannot be read, and holds only
// the bytes actually read when the payload is short. Embedded NUL characters
// in the payload are preserved.
inline void Deserialize(FILE* f, std::string& string)
{
    string.clear();

    uint8_t length = 0;
    if (fread(&length, sizeof(length), 1, f) != 1)
    {
        // no length byte available (EOF or error): nothing to decode
        return;
    }

    // a uint8_t length never exceeds 255, so the payload always fits
    char         cstr[256];
    const size_t got = fread(cstr, 1, length, f);

    // assign by byte count rather than as a C string so the result does not
    // stop at the first NUL inside the payload
    string.assign(cstr, got);
}
|
||||
|
||||
// Writes a BUCKET_DESC to 'f' field by field: name and description through
// the std::string Serialize() overload, then the raw bytes of enableThreadViz
// and color. The fwrite results are not checked.
inline void Serialize(FILE* f, const BUCKET_DESC& desc)
|
||||
{
|
||||
Serialize(f, desc.name);
|
||||
Serialize(f, desc.description);
|
||||
fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
|
||||
fwrite(&desc.color, sizeof(desc.color), 1, f);
|
||||
}
|
||||
|
||||
// Reads a BUCKET_DESC from 'f' in the same field order Serialize() writes:
// name, description, enableThreadViz, color. The fread results are not
// checked.
inline void Deserialize(FILE* f, BUCKET_DESC& desc)
|
||||
{
|
||||
Deserialize(f, desc.name);
|
||||
Deserialize(f, desc.description);
|
||||
fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
|
||||
fread(&desc.color, sizeof(desc.color), 1, f);
|
||||
}
|
||||
|
|
@ -1,168 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef __SWR_SIMD16INTRIN_H__
|
||||
#define __SWR_SIMD16INTRIN_H__
|
||||
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
typedef SIMD512 SIMD16;
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif // KNOB_SIMD16_WIDTH == 16
|
||||
|
||||
#define _simd16_setzero_ps SIMD16::setzero_ps
|
||||
#define _simd16_setzero_si SIMD16::setzero_si
|
||||
#define _simd16_set1_ps SIMD16::set1_ps
|
||||
#define _simd16_set1_epi8 SIMD16::set1_epi8
|
||||
#define _simd16_set1_epi32 SIMD16::set1_epi32
|
||||
#define _simd16_set_ps SIMD16::set_ps
|
||||
#define _simd16_set_epi32 SIMD16::set_epi32
|
||||
#define _simd16_load_ps SIMD16::load_ps
|
||||
#define _simd16_loadu_ps SIMD16::loadu_ps
|
||||
#if 1
|
||||
#define _simd16_load1_ps SIMD16::broadcast_ss
|
||||
#endif
|
||||
#define _simd16_load_si SIMD16::load_si
|
||||
#define _simd16_loadu_si SIMD16::loadu_si
|
||||
#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m)
|
||||
#define _simd16_store_ps SIMD16::store_ps
|
||||
#define _simd16_store_si SIMD16::store_si
|
||||
#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a)
|
||||
#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a)
|
||||
#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b)
|
||||
#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b)
|
||||
#define _simd16_maskstore_ps SIMD16::maskstore_ps
|
||||
#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b)
|
||||
#define _simd16_blendv_ps SIMD16::blendv_ps
|
||||
#define _simd16_blendv_epi32 SIMD16::blendv_epi32
|
||||
#define _simd16_mul_ps SIMD16::mul_ps
|
||||
#define _simd16_div_ps SIMD16::div_ps
|
||||
#define _simd16_add_ps SIMD16::add_ps
|
||||
#define _simd16_sub_ps SIMD16::sub_ps
|
||||
#define _simd16_rsqrt_ps SIMD16::rsqrt_ps
|
||||
#define _simd16_min_ps SIMD16::min_ps
|
||||
#define _simd16_max_ps SIMD16::max_ps
|
||||
#define _simd16_movemask_ps SIMD16::movemask_ps
|
||||
#define _simd16_movemask_pd SIMD16::movemask_pd
|
||||
#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32
|
||||
#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32
|
||||
#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps
|
||||
#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
|
||||
#define _simd16_cmplt_ps SIMD16::cmplt_ps
|
||||
#define _simd16_cmpgt_ps SIMD16::cmpgt_ps
|
||||
#define _simd16_cmpneq_ps SIMD16::cmpneq_ps
|
||||
#define _simd16_cmpeq_ps SIMD16::cmpeq_ps
|
||||
#define _simd16_cmpge_ps SIMD16::cmpge_ps
|
||||
#define _simd16_cmple_ps SIMD16::cmple_ps
|
||||
#define _simd16_castsi_ps SIMD16::castsi_ps
|
||||
#define _simd16_castps_si SIMD16::castps_si
|
||||
#define _simd16_castsi_pd SIMD16::castsi_pd
|
||||
#define _simd16_castpd_si SIMD16::castpd_si
|
||||
#define _simd16_castpd_ps SIMD16::castpd_ps
|
||||
#define _simd16_castps_pd SIMD16::castps_pd
|
||||
#define _simd16_and_ps SIMD16::and_ps
|
||||
#define _simd16_andnot_ps SIMD16::andnot_ps
|
||||
#define _simd16_or_ps SIMD16::or_ps
|
||||
#define _simd16_xor_ps SIMD16::xor_ps
|
||||
#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
|
||||
#define _simd16_mul_epi32 SIMD16::mul_epi32
|
||||
#define _simd16_mullo_epi32 SIMD16::mullo_epi32
|
||||
#define _simd16_sub_epi32 SIMD16::sub_epi32
|
||||
#define _simd16_sub_epi64 SIMD16::sub_epi64
|
||||
#define _simd16_min_epi32 SIMD16::min_epi32
|
||||
#define _simd16_max_epi32 SIMD16::max_epi32
|
||||
#define _simd16_min_epu32 SIMD16::min_epu32
|
||||
#define _simd16_max_epu32 SIMD16::max_epu32
|
||||
#define _simd16_add_epi32 SIMD16::add_epi32
|
||||
#define _simd16_and_si SIMD16::and_si
|
||||
#define _simd16_andnot_si SIMD16::andnot_si
|
||||
#define _simd16_or_si SIMD16::or_si
|
||||
#define _simd16_xor_si SIMD16::xor_si
|
||||
#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32
|
||||
#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32
|
||||
#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32
|
||||
#define _simd16_testz_ps SIMD16::testz_ps
|
||||
#define _simd16_unpacklo_ps SIMD16::unpacklo_ps
|
||||
#define _simd16_unpackhi_ps SIMD16::unpackhi_ps
|
||||
#define _simd16_unpacklo_pd SIMD16::unpacklo_pd
|
||||
#define _simd16_unpackhi_pd SIMD16::unpackhi_pd
|
||||
#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8
|
||||
#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8
|
||||
#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16
|
||||
#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16
|
||||
#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32
|
||||
#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32
|
||||
#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64
|
||||
#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64
|
||||
#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a)
|
||||
#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a)
|
||||
#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a)
|
||||
#define _simd16_fmadd_ps SIMD16::fmadd_ps
|
||||
#define _simd16_fmsub_ps SIMD16::fmsub_ps
|
||||
#define _simd16_adds_epu8 SIMD16::adds_epu8
|
||||
#define _simd16_subs_epu8 SIMD16::subs_epu8
|
||||
#define _simd16_add_epi8 SIMD16::add_epi8
|
||||
#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8
|
||||
|
||||
#define _simd16_i32gather_ps(m, index, scale) \
|
||||
SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
|
||||
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) \
|
||||
SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
|
||||
|
||||
#define _simd16_abs_epi32 SIMD16::abs_epi32
|
||||
|
||||
#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64
|
||||
#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64
|
||||
#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16
|
||||
#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16
|
||||
#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8
|
||||
#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8
|
||||
|
||||
#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a)
|
||||
#define _simd16_permute_ps SIMD16::permute_ps
|
||||
#define _simd16_permute_epi32 SIMD16::permute_epi32
|
||||
#define _simd16_sllv_epi32 SIMD16::sllv_epi32
|
||||
// Forward to the right-shift variant (srlv); this alias previously pointed at
// sllv_epi32, so every caller asking for a right shift got a left shift.
#define _simd16_srlv_epi32 SIMD16::srlv_epi32
|
||||
#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b)
|
||||
#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b)
|
||||
#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b)
|
||||
#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b)
|
||||
#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b)
|
||||
#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b)
|
||||
#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b)
|
||||
#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16
|
||||
#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32
|
||||
#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32
|
||||
#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64
|
||||
#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64
|
||||
#define _simd16_packus_epi16 SIMD16::packus_epi16
|
||||
#define _simd16_packs_epi16 SIMD16::packs_epi16
|
||||
#define _simd16_packus_epi32 SIMD16::packus_epi32
|
||||
#define _simd16_packs_epi32 SIMD16::packs_epi32
|
||||
#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
|
||||
#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
|
||||
#define _simd16_int2mask(mask) simd16mask(mask)
|
||||
#define _simd16_mask2int(mask) int(mask)
|
||||
#define _simd16_vmask_ps SIMD16::vmask_ps
|
||||
|
||||
#endif //__SWR_SIMD16INTRIN_H__
|
||||
|
|
@ -1,322 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef __SWR_SIMDINTRIN_H__
|
||||
#define __SWR_SIMDINTRIN_H__
|
||||
|
||||
#include "common/intrin.h"
|
||||
#include "common/simdlib.hpp"
|
||||
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
typedef SIMD256 SIMD;
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif // KNOB_SIMD_WIDTH == 8
|
||||
|
||||
#define _simd128_maskstore_ps SIMD128::maskstore_ps
|
||||
#define _simd128_fmadd_ps SIMD128::fmadd_ps
|
||||
|
||||
#define _simd_load_ps SIMD::load_ps
|
||||
#define _simd_load1_ps SIMD::broadcast_ss
|
||||
#define _simd_loadu_ps SIMD::loadu_ps
|
||||
#define _simd_setzero_ps SIMD::setzero_ps
|
||||
#define _simd_set1_ps SIMD::set1_ps
|
||||
#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
|
||||
#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
|
||||
#define _simd_blendv_ps SIMD::blendv_ps
|
||||
#define _simd_store_ps SIMD::store_ps
|
||||
#define _simd_mul_ps SIMD::mul_ps
|
||||
#define _simd_add_ps SIMD::add_ps
|
||||
#define _simd_sub_ps SIMD::sub_ps
|
||||
#define _simd_rsqrt_ps SIMD::rsqrt_ps
|
||||
#define _simd_min_ps SIMD::min_ps
|
||||
#define _simd_max_ps SIMD::max_ps
|
||||
#define _simd_movemask_ps SIMD::movemask_ps
|
||||
#define _simd_cvtps_epi32 SIMD::cvtps_epi32
|
||||
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
|
||||
#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
|
||||
#define _simd_cmplt_ps SIMD::cmplt_ps
|
||||
#define _simd_cmpgt_ps SIMD::cmpgt_ps
|
||||
#define _simd_cmpneq_ps SIMD::cmpneq_ps
|
||||
#define _simd_cmpeq_ps SIMD::cmpeq_ps
|
||||
#define _simd_cmpge_ps SIMD::cmpge_ps
|
||||
#define _simd_cmple_ps SIMD::cmple_ps
|
||||
#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
|
||||
#define _simd_and_ps SIMD::and_ps
|
||||
#define _simd_or_ps SIMD::or_ps
|
||||
#define _simd_rcp_ps SIMD::rcp_ps
|
||||
#define _simd_div_ps SIMD::div_ps
|
||||
#define _simd_castsi_ps SIMD::castsi_ps
|
||||
#define _simd_castps_pd SIMD::castps_pd
|
||||
#define _simd_castpd_ps SIMD::castpd_ps
|
||||
#define _simd_andnot_ps SIMD::andnot_ps
|
||||
#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
|
||||
#define _simd_castpd_ps SIMD::castpd_ps
|
||||
#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
|
||||
#define _simd_stream_ps SIMD::stream_ps
|
||||
|
||||
#define _simd_movemask_pd SIMD::movemask_pd
|
||||
#define _simd_castsi_pd SIMD::castsi_pd
|
||||
|
||||
#define _simd_mul_epi32 SIMD::mul_epi32
|
||||
#define _simd_mullo_epi32 SIMD::mullo_epi32
|
||||
#define _simd_sub_epi32 SIMD::sub_epi32
|
||||
#define _simd_sub_epi64 SIMD::sub_epi64
|
||||
#define _simd_min_epi32 SIMD::min_epi32
|
||||
#define _simd_min_epu32 SIMD::min_epu32
|
||||
#define _simd_max_epi32 SIMD::max_epi32
|
||||
#define _simd_max_epu32 SIMD::max_epu32
|
||||
#define _simd_add_epi32 SIMD::add_epi32
|
||||
#define _simd_and_si SIMD::and_si
|
||||
#define _simd_andnot_si SIMD::andnot_si
|
||||
#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
|
||||
#define _simd_cmplt_epi32 SIMD::cmplt_epi32
|
||||
#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
|
||||
#define _simd_or_si SIMD::or_si
|
||||
#define _simd_xor_si SIMD::xor_si
|
||||
#define _simd_castps_si SIMD::castps_si
|
||||
#define _simd_adds_epu8 SIMD::adds_epu8
|
||||
#define _simd_subs_epu8 SIMD::subs_epu8
|
||||
#define _simd_add_epi8 SIMD::add_epi8
|
||||
#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
|
||||
#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
|
||||
#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
|
||||
#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
|
||||
#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
|
||||
#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
|
||||
#define _simd_movemask_epi8 SIMD::movemask_epi8
|
||||
#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
|
||||
#define _simd_permute_ps SIMD::permute_ps
|
||||
#define _simd_permute_epi32 SIMD::permute_epi32
|
||||
#define _simd_srlv_epi32 SIMD::srlv_epi32
|
||||
#define _simd_sllv_epi32 SIMD::sllv_epi32
|
||||
|
||||
#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
|
||||
#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
|
||||
#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
|
||||
#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
|
||||
#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
|
||||
#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
|
||||
#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
|
||||
#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
|
||||
|
||||
#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
|
||||
#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
|
||||
#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
|
||||
#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)
|
||||
|
||||
#define _simd_fmadd_ps SIMD::fmadd_ps
|
||||
#define _simd_fmsub_ps SIMD::fmsub_ps
|
||||
#define _simd_shuffle_epi8 SIMD::shuffle_epi8
|
||||
|
||||
#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
|
||||
#define _simd_mask_i32gather_ps(r, p, o, m, s) \
|
||||
SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
|
||||
#define _simd_abs_epi32 SIMD::abs_epi32
|
||||
|
||||
#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
|
||||
#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
|
||||
#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
|
||||
#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
|
||||
#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
|
||||
|
||||
#define _simd_packus_epi16 SIMD::packus_epi16
|
||||
#define _simd_packs_epi16 SIMD::packs_epi16
|
||||
#define _simd_packus_epi32 SIMD::packus_epi32
|
||||
#define _simd_packs_epi32 SIMD::packs_epi32
|
||||
|
||||
#define _simd_unpacklo_ps SIMD::unpacklo_ps
|
||||
#define _simd_unpackhi_ps SIMD::unpackhi_ps
|
||||
#define _simd_unpacklo_pd SIMD::unpacklo_pd
|
||||
#define _simd_unpackhi_pd SIMD::unpackhi_pd
|
||||
#define _simd_insertf128_ps SIMD::insertf128_ps
|
||||
#define _simd_insertf128_pd SIMD::insertf128_pd
|
||||
#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
|
||||
#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
|
||||
#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
|
||||
#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
|
||||
#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
|
||||
#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
|
||||
#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
|
||||
#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
|
||||
#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
|
||||
#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
|
||||
#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
|
||||
#define _simd_set1_epi32 SIMD::set1_epi32
|
||||
#define _simd_set_epi32 SIMD::set_epi32
|
||||
#define _simd_set_ps SIMD::set_ps
|
||||
#define _simd_set1_epi8 SIMD::set1_epi8
|
||||
#define _simd_setzero_si SIMD::setzero_si
|
||||
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
|
||||
#define _simd_store_si SIMD::store_si
|
||||
#define _simd_broadcast_ss SIMD::broadcast_ss
|
||||
#define _simd_maskstore_ps SIMD::maskstore_ps
|
||||
#define _simd_load_si SIMD::load_si
|
||||
#define _simd_loadu_si SIMD::loadu_si
|
||||
#define _simd_sub_ps SIMD::sub_ps
|
||||
#define _simd_testz_ps SIMD::testz_ps
|
||||
#define _simd_testz_si SIMD::testz_si
|
||||
#define _simd_xor_ps SIMD::xor_ps
|
||||
|
||||
#define _simd_loadu2_si SIMD::loadu2_si
|
||||
#define _simd_storeu2_si SIMD::storeu2_si
|
||||
|
||||
#define _simd_blendv_epi32 SIMD::blendv_epi32
|
||||
#define _simd_vmask_ps SIMD::vmask_ps
|
||||
|
||||
template <int mask>
|
||||
SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b)
|
||||
{
|
||||
return SIMD128::castps_si(
|
||||
SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Compute plane equation vA * vX + vB * vY + vC
|
||||
SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
|
||||
simdscalar const& vB,
|
||||
simdscalar const& vC,
|
||||
simdscalar const& vX,
|
||||
simdscalar const& vY)
|
||||
{
|
||||
simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
|
||||
vOut = _simd_fmadd_ps(vB, vY, vOut);
|
||||
return vOut;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Compute plane equation vA * vX + vB * vY + vC
|
||||
SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA,
|
||||
simd4scalar const& vB,
|
||||
simd4scalar const& vC,
|
||||
simd4scalar const& vX,
|
||||
simd4scalar const& vY)
|
||||
{
|
||||
simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
|
||||
vOut = _simd128_fmadd_ps(vB, vY, vOut);
|
||||
return vOut;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Interpolates a single component.
|
||||
/// @param vI - barycentric I
|
||||
/// @param vJ - barycentric J
|
||||
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
|
||||
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
|
||||
static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI,
|
||||
simdscalar const& vJ,
|
||||
const float* pInterpBuffer)
|
||||
{
|
||||
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
|
||||
const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
|
||||
const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
|
||||
|
||||
if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0]))
|
||||
{
|
||||
// Ensure constant attribs are constant. Required for proper
|
||||
// 3D resource copies.
|
||||
return _simd_broadcast_ss(pInterpA);
|
||||
}
|
||||
|
||||
simdscalar vA = _simd_broadcast_ss(pInterpA);
|
||||
simdscalar vB = _simd_broadcast_ss(pInterpB);
|
||||
simdscalar vC = _simd_broadcast_ss(pInterpC);
|
||||
|
||||
simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
|
||||
vC = _simd_mul_ps(vk, vC);
|
||||
|
||||
return vplaneps(vA, vB, vC, vI, vJ);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Interpolates a single component (flat shade).
|
||||
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
|
||||
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
|
||||
static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer)
|
||||
{
|
||||
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
|
||||
|
||||
simdscalar vA = _simd_broadcast_ss(pInterpA);
|
||||
|
||||
return vA;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Interpolates a single component (flat shade).
|
||||
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
|
||||
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
|
||||
static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer)
|
||||
{
|
||||
const uint32_t interpA = pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
|
||||
|
||||
simdscalari vA = _simd_set1_epi32(interpA);
|
||||
|
||||
return vA;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Interpolates a single component.
|
||||
/// @param vI - barycentric I
|
||||
/// @param vJ - barycentric J
|
||||
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
|
||||
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
|
||||
static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI,
|
||||
simd4scalar const& vJ,
|
||||
const float* pInterpBuffer)
|
||||
{
|
||||
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
|
||||
const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
|
||||
const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
|
||||
|
||||
if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0]))
|
||||
{
|
||||
// Ensure constant attribs are constant. Required for proper
|
||||
// 3D resource copies.
|
||||
return SIMD128::broadcast_ss(pInterpA);
|
||||
}
|
||||
|
||||
simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
|
||||
simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
|
||||
simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
|
||||
|
||||
simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
|
||||
vC = SIMD128::mul_ps(vk, vC);
|
||||
|
||||
return vplaneps(vA, vB, vC, vI, vJ);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Absolute value of a simd4scalar, computed by reinterpreting the
///        floats as integers and ANDing with 0x7fffffff (all bits except
///        bit 31).
/// @param a - input vector
static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a)
{
    const simd4scalari rawBits     = SIMD128::castps_si(a);
    const simd4scalari clearedSign = SIMD128::and_si(rawBits, SIMD128::set1_epi32(0x7fffffff));

    return SIMD128::castsi_ps(clearedSign);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Absolute value of a simdscalar, computed by reinterpreting the
///        floats as integers and ANDing with 0x7fffffff (all bits except
///        bit 31).
/// @param a - input vector
static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
{
    const simdscalari rawBits     = _simd_castps_si(a);
    const simdscalari clearedSign = _simd_and_si(rawBits, _simd_set1_epi32(0x7fffffff));

    return _simd_castsi_ps(clearedSign);
}
|
||||
|
||||
#include "simd16intrin.h"
|
||||
|
||||
#endif //__SWR_SIMDINTRIN_H__
|
||||
|
|
@ -1,234 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "simdlib_types.hpp"
|
||||
|
||||
// For documentation, please see the following include...
|
||||
// #include "simdlib_interface.hpp"
|
||||
|
||||
namespace SIMDImpl
|
||||
{
|
||||
namespace SIMD128Impl
|
||||
{
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX
|
||||
struct AVXImpl
|
||||
{
|
||||
#define __SIMD_LIB_AVX_HPP__
|
||||
#include "simdlib_128_avx.inl"
|
||||
#undef __SIMD_LIB_AVX_HPP__
|
||||
}; // struct AVXImpl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX2
|
||||
struct AVX2Impl : AVXImpl
|
||||
{
|
||||
#define __SIMD_LIB_AVX2_HPP__
|
||||
#include "simdlib_128_avx2.inl"
|
||||
#undef __SIMD_LIB_AVX2_HPP__
|
||||
}; // struct AVX2Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
struct AVX512Impl : AVX2Impl
|
||||
{
|
||||
#if defined(SIMD_OPT_128_AVX512)
|
||||
#define __SIMD_LIB_AVX512_HPP__
|
||||
#include "simdlib_128_avx512.inl"
|
||||
#if defined(SIMD_ARCH_KNIGHTS)
|
||||
#include "simdlib_128_avx512_knights.inl"
|
||||
#else // optimize for core
|
||||
#include "simdlib_128_avx512_core.inl"
|
||||
#endif // defined(SIMD_ARCH_KNIGHTS)
|
||||
#undef __SIMD_LIB_AVX512_HPP__
|
||||
#endif // SIMD_OPT_128_AVX512
|
||||
}; // struct AVX512Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
||||
struct Traits : SIMDImpl::Traits
|
||||
{
|
||||
#if SIMD_ARCH == SIMD_ARCH_AVX
|
||||
using IsaImpl = AVXImpl;
|
||||
#elif SIMD_ARCH == SIMD_ARCH_AVX2
|
||||
using IsaImpl = AVX2Impl;
|
||||
#elif SIMD_ARCH == SIMD_ARCH_AVX512
|
||||
using IsaImpl = AVX512Impl;
|
||||
#else
|
||||
#error Invalid value for SIMD_ARCH
|
||||
#endif
|
||||
|
||||
using Float = SIMD128Impl::Float;
|
||||
using Double = SIMD128Impl::Double;
|
||||
using Integer = SIMD128Impl::Integer;
|
||||
using Vec4 = SIMD128Impl::Vec4;
|
||||
using Mask = SIMD128Impl::Mask;
|
||||
};
|
||||
} // namespace SIMD128Impl
|
||||
|
||||
namespace SIMD256Impl
|
||||
{
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX
|
||||
struct AVXImpl
|
||||
{
|
||||
#define __SIMD_LIB_AVX_HPP__
|
||||
#include "simdlib_256_avx.inl"
|
||||
#undef __SIMD_LIB_AVX_HPP__
|
||||
}; // struct AVXImpl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX2
|
||||
struct AVX2Impl : AVXImpl
|
||||
{
|
||||
#define __SIMD_LIB_AVX2_HPP__
|
||||
#include "simdlib_256_avx2.inl"
|
||||
#undef __SIMD_LIB_AVX2_HPP__
|
||||
}; // struct AVX2Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
struct AVX512Impl : AVX2Impl
|
||||
{
|
||||
#if defined(SIMD_OPT_256_AVX512)
|
||||
#define __SIMD_LIB_AVX512_HPP__
|
||||
#include "simdlib_256_avx512.inl"
|
||||
#if defined(SIMD_ARCH_KNIGHTS)
|
||||
#include "simdlib_256_avx512_knights.inl"
|
||||
#else // optimize for core
|
||||
#include "simdlib_256_avx512_core.inl"
|
||||
#endif // defined(SIMD_ARCH_KNIGHTS)
|
||||
#undef __SIMD_LIB_AVX512_HPP__
|
||||
#endif // SIMD_OPT_256_AVX512
|
||||
}; // struct AVX512Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
||||
struct Traits : SIMDImpl::Traits
|
||||
{
|
||||
#if SIMD_ARCH == SIMD_ARCH_AVX
|
||||
using IsaImpl = AVXImpl;
|
||||
#elif SIMD_ARCH == SIMD_ARCH_AVX2
|
||||
using IsaImpl = AVX2Impl;
|
||||
#elif SIMD_ARCH == SIMD_ARCH_AVX512
|
||||
using IsaImpl = AVX512Impl;
|
||||
#else
|
||||
#error Invalid value for SIMD_ARCH
|
||||
#endif
|
||||
|
||||
using Float = SIMD256Impl::Float;
|
||||
using Double = SIMD256Impl::Double;
|
||||
using Integer = SIMD256Impl::Integer;
|
||||
using Vec4 = SIMD256Impl::Vec4;
|
||||
using Mask = SIMD256Impl::Mask;
|
||||
};
|
||||
} // namespace SIMD256Impl
|
||||
|
||||
namespace SIMD512Impl
|
||||
{
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX
|
||||
template <typename SIMD256T>
|
||||
struct AVXImplBase
|
||||
{
|
||||
#define __SIMD_LIB_AVX_HPP__
|
||||
#include "simdlib_512_emu.inl"
|
||||
#include "simdlib_512_emu_masks.inl"
|
||||
#undef __SIMD_LIB_AVX_HPP__
|
||||
}; // struct AVXImplBase
|
||||
using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX2
|
||||
using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
|
||||
{
|
||||
#define __SIMD_LIB_AVX512_HPP__
|
||||
#include "simdlib_512_avx512.inl"
|
||||
#include "simdlib_512_avx512_masks.inl"
|
||||
#if defined(SIMD_ARCH_KNIGHTS)
|
||||
#include "simdlib_512_avx512_knights.inl"
|
||||
#include "simdlib_512_avx512_masks_knights.inl"
|
||||
#else // optimize for core
|
||||
#include "simdlib_512_avx512_core.inl"
|
||||
#include "simdlib_512_avx512_masks_core.inl"
|
||||
#endif // defined(SIMD_ARCH_KNIGHTS)
|
||||
#undef __SIMD_LIB_AVX512_HPP__
|
||||
}; // struct AVX512Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
||||
struct Traits : SIMDImpl::Traits
|
||||
{
|
||||
#if SIMD_ARCH == SIMD_ARCH_AVX
|
||||
using IsaImpl = AVXImpl;
|
||||
#elif SIMD_ARCH == SIMD_ARCH_AVX2
|
||||
using IsaImpl = AVX2Impl;
|
||||
#elif SIMD_ARCH == SIMD_ARCH_AVX512
|
||||
using IsaImpl = AVX512Impl;
|
||||
#else
|
||||
#error Invalid value for SIMD_ARCH
|
||||
#endif
|
||||
|
||||
using Float = SIMD512Impl::Float;
|
||||
using Double = SIMD512Impl::Double;
|
||||
using Integer = SIMD512Impl::Integer;
|
||||
using Vec4 = SIMD512Impl::Vec4;
|
||||
using Mask = SIMD512Impl::Mask;
|
||||
};
|
||||
} // namespace SIMD512Impl
|
||||
} // namespace SIMDImpl
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Front-end type for one SIMD width. Derives from Traits::IsaImpl,
///        so everything that implementation declares is reachable through
///        SIMDBase<Traits>, and re-exports the type names published by
///        Traits.
/// @tparam Traits - must provide IsaImpl, CompareType, ScaleFactor,
///                  RoundMode, Float, Double, Integer, Vec4 and Mask
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
    // The ISA implementation this type derives from.
    using SIMD = typename Traits::IsaImpl;

    // Selector types used as template arguments by some operations.
    using CompareType = typename Traits::CompareType;
    using RoundMode   = typename Traits::RoundMode;
    using ScaleFactor = typename Traits::ScaleFactor;

    // Vector and mask value types.
    using Float   = typename Traits::Float;
    using Double  = typename Traits::Double;
    using Integer = typename Traits::Integer;
    using Vec4    = typename Traits::Vec4;
    using Mask    = typename Traits::Mask;
}; // struct SIMDBase
|
||||
|
||||
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
|
||||
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
|
||||
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
// Shorthand alias templates: Name<SIMD_T> is the nested type SIMD_T::Name,
// so generic code can write Float<SIMD_T> instead of
// typename SIMD_T::Float.
//////////////////////////////////////////////////////////////////////////

// Selector types.
template <typename SIMD_T> using CompareType = typename SIMD_T::CompareType;
template <typename SIMD_T> using ScaleFactor = typename SIMD_T::ScaleFactor;
template <typename SIMD_T> using RoundMode   = typename SIMD_T::RoundMode;

// Vector and mask value types.
template <typename SIMD_T> using Float   = typename SIMD_T::Float;
template <typename SIMD_T> using Double  = typename SIMD_T::Double;
template <typename SIMD_T> using Integer = typename SIMD_T::Integer;
template <typename SIMD_T> using Vec4    = typename SIMD_T::Vec4;
template <typename SIMD_T> using Mask    = typename SIMD_T::Mask;
|
||||
|
||||
|
|
@ -1,593 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (1) implementation
|
||||
//============================================================================
|
||||
|
||||
// Wrapper-generating macros for the 128-bit AVX (1) implementation.  Each one
// expands to a static function named `op` that forwards to an _mm_ intrinsic.
// Variants whose name ends in '_' take the intrinsic name as a second argument;
// the others build it by pasting `op` onto the _mm_ prefix.  A trailing 'I'
// adds a compile-time immediate passed as the ImmT template argument.

// Float op(Float)
#define SIMD_WRAPPER_1(op)                       \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {                                            \
        return _mm_##op(a);                      \
    }

// Float op(Float, Float)
#define SIMD_WRAPPER_2(op)                                \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {                                                     \
        return _mm_##op(a, b);                            \
    }

// Double op(Double, Double)
#define SIMD_DWRAPPER_2(op)                                  \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {                                                        \
        return _mm_##op(a, b);                               \
    }

// Float op<ImmT>(Float, Float)
#define SIMD_WRAPPER_2I(op)                                                   \
    template <int ImmT>                                                       \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b, ImmT); }

// Double op<ImmT>(Double, Double)
#define SIMD_DWRAPPER_2I(op)                                                  \
    template <int ImmT>                                                       \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b, ImmT); }

// Float op(Float, Float, Float)
#define SIMD_WRAPPER_3(op)                                         \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {                                                              \
        return _mm_##op(a, b, c);                                  \
    }

// Integer op(Integer)
#define SIMD_IWRAPPER_1(op)                          \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {                                                \
        return _mm_##op(a);                          \
    }

// Integer op<ImmT>(Integer), explicit intrinsic name
#define SIMD_IWRAPPER_1I_(op, intrin)                                         \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer a) { return intrin(a, ImmT); }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)

// Integer op(Integer, Integer), explicit intrinsic name
#define SIMD_IWRAPPER_2_(op, intrin)                            \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return intrin(a, b);                                    \
    }

// Integer op(Integer, Integer), intrinsic name derived from op
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, _mm_##op)

// Integer op(Integer, Integer) implemented with a float intrinsic: operands are
// bit-cast to Float, run through `intrin`, and the result is bit-cast back.
#define SIMD_IFWRAPPER_2(op, intrin)                                          \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)               \
    {                                                                         \
        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));                 \
    }

// Integer op<ImmT>(Integer, Integer)
#define SIMD_IWRAPPER_2I(op)                                                  \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b, ImmT); }
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps); // return a + b
|
||||
SIMD_WRAPPER_2(div_ps); // return a / b
|
||||
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
|
||||
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
|
||||
SIMD_WRAPPER_2(mul_ps); // return a * b
|
||||
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
|
||||
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
|
||||
SIMD_WRAPPER_2(sub_ps); // return a - b
|
||||
|
||||
// Multiply-add for AVX (1): returns (a * b) + c, built from mul_ps followed by
// add_ps.
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)
{
    const Float product = mul_ps(a, b);
    return add_ps(product, c);
}
|
||||
// Multiply-subtract for AVX (1): returns (a * b) - c, built from mul_ps followed
// by sub_ps.
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c)
{
    const Float product = mul_ps(a, b);
    return sub_ps(product, c);
}
|
||||
|
||||
template <RoundMode RMT>
|
||||
static SIMDINLINE Float SIMDCALL round_ps(Float a)
|
||||
{
|
||||
return _mm_round_ps(a, static_cast<int>(RMT));
|
||||
}
|
||||
|
||||
// Shorthand for round_ps with RoundMode::CEIL_NOEXC.
static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
{
    const Float rounded = round_ps<RoundMode::CEIL_NOEXC>(a);
    return rounded;
}
|
||||
// Shorthand for round_ps with RoundMode::FLOOR_NOEXC.
static SIMDINLINE Float SIMDCALL floor_ps(Float a)
{
    const Float rounded = round_ps<RoundMode::FLOOR_NOEXC>(a);
    return rounded;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
|
||||
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
|
||||
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2(mullo_epi32);
|
||||
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
|
||||
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
|
||||
SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int)
|
||||
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
|
||||
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int)
|
||||
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
|
||||
SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int)
|
||||
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
|
||||
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
|
||||
SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT
|
||||
|
||||
// Per-lane variable left shift: returns a << b for each 32-bit lane (uint32).
//
// AVX (1) has no variable-shift instruction, so the lanes are spilled to
// memory, shifted as scalars and reloaded.  Lanes are handled as uint32_t
// because left-shifting a negative int32_t is undefined before C++20.  A count
// of 32 or more produces 0, matching _mm_sllv_epi32 as used by the AVX (2)
// implementation; a plain scalar shift by >= the type width is undefined.
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
    uint32_t values[4];
    uint32_t counts[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(values), vA);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(counts), vB);

    for (uint32_t lane = 0; lane < 4; ++lane)
    {
        values[lane] = (counts[lane] < 32u) ? (values[lane] << counts[lane]) : 0u;
    }

    return _mm_loadu_si128(reinterpret_cast<__m128i const*>(values));
}
|
||||
|
||||
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
|
||||
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
|
||||
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
|
||||
|
||||
// Right shift of the 64-bit lanes of a by the count held in n; forwards to
// _mm_srl_epi64.
static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n)
{
    const Integer shifted = _mm_srl_epi64(a, n);
    return shifted;
}
|
||||
|
||||
template <int ImmT> // same as srli_si, but with Float cast to int
|
||||
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
|
||||
{
|
||||
return castsi_ps(srli_si<ImmT>(castps_si(a)));
|
||||
}
|
||||
|
||||
// Per-lane variable logical right shift: returns a >> b for each 32-bit lane
// (uint32).
//
// AVX (1) has no variable-shift instruction, so the lanes are spilled to
// memory, shifted as scalars and reloaded.  Lanes are handled as uint32_t so
// that zeros are shifted in; shifting an int32_t would replicate the sign bit
// for negative lanes.  A count of 32 or more produces 0, matching
// _mm_srlv_epi32 as used by the AVX (2) implementation; a plain scalar shift
// by >= the type width is undefined.
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
    uint32_t values[4];
    uint32_t counts[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(values), vA);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(counts), vB);

    for (uint32_t lane = 0; lane < 4; ++lane)
    {
        values[lane] = (counts[lane] < 32u) ? (values[lane] >> counts[lane]) : 0u;
    }

    return _mm_loadu_si128(reinterpret_cast<__m128i const*>(values));
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations
|
||||
//-----------------------------------------------------------------------
|
||||
// View the bits of a Double vector as a Float vector: return *(Float*)(&a).
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)
{
    const Float viewed = _mm_castpd_ps(a);
    return viewed;
}
|
||||
|
||||
// View the bits of a Float vector as an Integer vector: return *(Integer*)(&a).
static SIMDINLINE Integer SIMDCALL castps_si(Float a)
{
    const Integer viewed = _mm_castps_si128(a);
    return viewed;
}
|
||||
|
||||
// View the bits of an Integer vector as a Double vector: return *(Double*)(&a).
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)
{
    const Double viewed = _mm_castsi128_pd(a);
    return viewed;
}
|
||||
|
||||
// View the bits of a Float vector as a Double vector: return *(Double*)(&a).
static SIMDINLINE Double SIMDCALL castps_pd(Float a)
{
    const Double viewed = _mm_castps_pd(a);
    return viewed;
}
|
||||
|
||||
// View the bits of an Integer vector as a Float vector: return *(Float*)(&a).
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)
{
    const Float viewed = _mm_castsi128_ps(a);
    return viewed;
}
|
||||
|
||||
// Value conversion int32 --> float for every lane: return (float)a.
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)
{
    const Float converted = _mm_cvtepi32_ps(a);
    return converted;
}
|
||||
|
||||
// Extract the first 32-bit lane as a scalar: return a.v[0].
static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a)
{
    const int32_t lane0 = _mm_cvtsi128_si32(a);
    return lane0;
}
|
||||
|
||||
// Build a vector from a scalar: a[0] = n, a[1]...a[3] = 0.
static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n)
{
    const Integer widened = _mm_cvtsi32_si128(n);
    return widened;
}
|
||||
|
||||
SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
|
||||
SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
|
||||
SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
|
||||
SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
|
||||
SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
|
||||
|
||||
// Value conversion float --> int32 for every lane: return (int32)a.
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)
{
    const Integer converted = _mm_cvtps_epi32(a);
    return converted;
}
|
||||
|
||||
// Truncating value conversion float --> int32 for every lane:
// return (int32)rnd_to_zero(a).
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)
{
    const Integer truncated = _mm_cvttps_epi32(a);
    return truncated;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <CompareType CmpTypeT>
|
||||
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
|
||||
{
|
||||
return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
|
||||
}
|
||||
// cmp_ps specialised with CompareType::LT_OQ (a < b).
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
    const Float laneMask = cmp_ps<CompareType::LT_OQ>(a, b);
    return laneMask;
}
|
||||
// cmp_ps specialised with CompareType::GT_OQ (a > b).
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
    const Float laneMask = cmp_ps<CompareType::GT_OQ>(a, b);
    return laneMask;
}
|
||||
// cmp_ps specialised with CompareType::NEQ_OQ (a != b).
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
    const Float laneMask = cmp_ps<CompareType::NEQ_OQ>(a, b);
    return laneMask;
}
|
||||
// cmp_ps specialised with CompareType::EQ_OQ (a == b).
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
    const Float laneMask = cmp_ps<CompareType::EQ_OQ>(a, b);
    return laneMask;
}
|
||||
// cmp_ps specialised with CompareType::GE_OQ (a >= b).
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
    const Float laneMask = cmp_ps<CompareType::GE_OQ>(a, b);
    return laneMask;
}
|
||||
// cmp_ps specialised with CompareType::LE_OQ (a <= b).
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
    const Float laneMask = cmp_ps<CompareType::LE_OQ>(a, b);
    return laneMask;
}
|
||||
|
||||
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
|
||||
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (float); the int result of
// _mm_testz_ps is converted to bool.
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)
{
    const int zeroFlag = _mm_testz_ps(a, b);
    return zeroFlag != 0;
}
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (int); the int result of
// _mm_testz_si128 is converted to bool.
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)
{
    const int zeroFlag = _mm_testz_si128(a, b);
    return zeroFlag != 0;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
|
||||
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
|
||||
|
||||
// Integer blend driven by a Float mask: return mask ? b : a (int).
// The operands are viewed as Float, blended with blendv_ps, and viewed back.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask)
{
    const Float lhs     = castsi_ps(a);
    const Float rhs     = castsi_ps(b);
    const Float blended = blendv_ps(lhs, rhs, mask);
    return castps_si(blended);
}
|
||||
|
||||
// Integer blend driven by an Integer mask: return mask ? b : a (int).
// All three operands are viewed as Float, blended with blendv_ps, and the
// result is viewed back as Integer.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask)
{
    const Float lhs      = castsi_ps(a);
    const Float rhs      = castsi_ps(b);
    const Float selector = castsi_ps(mask);
    const Float blended  = blendv_ps(lhs, rhs, selector);
    return castps_si(blended);
}
|
||||
|
||||
// return *p (all elements in vector get same value)
static SIMDINLINE Float SIMDCALL broadcast_ss(float const* p)
{
    const Float splat = _mm_broadcast_ss(p);
    return splat;
}
|
||||
|
||||
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (int).
// Implemented with the float permute: a is viewed as Float, permuted with
// _mm_permutevar_ps, and the result is viewed back as Integer.
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)
{
    Float asFloat  = castsi_ps(a);
    Float permuted = _mm_permutevar_ps(asFloat, swiz);
    return castps_si(permuted);
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (float)
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
{
    const Float permuted = _mm_permutevar_ps(a, swiz);
    return permuted;
}
|
||||
|
||||
SIMD_IWRAPPER_1I(shuffle_epi32);
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
|
||||
|
||||
SIMD_IWRAPPER_2(shuffle_epi8);
|
||||
SIMD_DWRAPPER_2I(shuffle_pd);
|
||||
SIMD_WRAPPER_2I(shuffle_ps);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi16);
|
||||
|
||||
// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
|
||||
// Integer unpack-high implemented with the float intrinsic: both operands are
// viewed as Float, passed to _mm_unpackhi_ps, and the result is viewed back as
// Integer.
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    Float lhs         = castsi_ps(a);
    Float rhs         = castsi_ps(b);
    Float interleaved = _mm_unpackhi_ps(lhs, rhs);
    return castps_si(interleaved);
}
|
||||
|
||||
SIMD_IWRAPPER_2(unpackhi_epi64);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi8);
|
||||
SIMD_DWRAPPER_2(unpackhi_pd);
|
||||
SIMD_WRAPPER_2(unpackhi_ps);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi16);
|
||||
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi64);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi8);
|
||||
SIMD_DWRAPPER_2(unpacklo_pd);
|
||||
SIMD_WRAPPER_2(unpacklo_ps);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
uint32_t* pOffsets = (uint32_t*)&idx;
|
||||
Float vResult;
|
||||
float* pResult = (float*)&vResult;
|
||||
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
|
||||
{
|
||||
uint32_t offset = pOffsets[i];
|
||||
offset = offset * static_cast<uint32_t>(ScaleT);
|
||||
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
|
||||
}
|
||||
|
||||
return vResult;
|
||||
}
|
||||
|
||||
// return *p (broadcast 1 value to all elements); alias for broadcast_ss.
static SIMDINLINE Float SIMDCALL load1_ps(float const* p)
{
    const Float splat = broadcast_ss(p);
    return splat;
}
|
||||
|
||||
// return *p (loads SIMD width elements from memory); forwards to _mm_load_ps.
static SIMDINLINE Float SIMDCALL load_ps(float const* p)
{
    const Float loaded = _mm_load_ps(p);
    return loaded;
}
|
||||
|
||||
// return *p; forwards the address of the wrapped vector to _mm_load_si128.
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    const auto* const source = &p->v;
    return _mm_load_si128(source);
}
|
||||
|
||||
// return *p (same as load_ps but allows for unaligned mem)
static SIMDINLINE Float SIMDCALL loadu_ps(float const* p)
{
    const Float loaded = _mm_loadu_ps(p);
    return loaded;
}
|
||||
|
||||
// return *p (same as load_si but allows for unaligned mem); uses
// _mm_lddqu_si128.
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const* p)
{
    const auto* const source = &p->v;
    return _mm_lddqu_si128(source);
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
|
||||
{
|
||||
uint32_t* pOffsets = (uint32_t*)&idx;
|
||||
Float vResult = old;
|
||||
float* pResult = (float*)&vResult;
|
||||
unsigned long index;
|
||||
uint32_t umask = movemask_ps(mask);
|
||||
while (_BitScanForward(&index, umask))
|
||||
{
|
||||
umask &= ~(1 << index);
|
||||
uint32_t offset = pOffsets[index];
|
||||
offset = offset * static_cast<uint32_t>(ScaleT);
|
||||
pResult[index] = *(float const*)(((uint8_t const*)p + offset));
|
||||
}
|
||||
|
||||
return vResult;
|
||||
}
|
||||
|
||||
// Masked store of src to p; forwards to _mm_maskstore_ps with the given
// per-lane mask.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    _mm_maskstore_ps(p, mask, src);
}
|
||||
|
||||
// Result of _mm_movemask_epi8(a), returned as an unsigned 32-bit value.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    const int bits = _mm_movemask_epi8(a);
    return static_cast<uint32_t>(bits);
}
|
||||
|
||||
// Result of _mm_movemask_pd(a), returned as an unsigned 32-bit value.
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    const int bits = _mm_movemask_pd(a);
    return static_cast<uint32_t>(bits);
}
|
||||
// Result of _mm_movemask_ps(a), returned as an unsigned 32-bit value.
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    const int bits = _mm_movemask_ps(a);
    return static_cast<uint32_t>(bits);
}
|
||||
|
||||
// return i (all 32-bit elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
{
    const Integer splat = _mm_set1_epi32(i);
    return splat;
}
|
||||
|
||||
// return i (all 8-bit elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
{
    const Integer splat = _mm_set1_epi8(i);
    return splat;
}
|
||||
|
||||
// return f (all elements are same value)
static SIMDINLINE Float SIMDCALL set1_ps(float f)
{
    const Float splat = _mm_set1_ps(f);
    return splat;
}
|
||||
|
||||
// return 0 (float)
static SIMDINLINE Float SIMDCALL setzero_ps()
{
    const Float zero = _mm_setzero_ps();
    return zero;
}
|
||||
|
||||
// return 0 (integer)
static SIMDINLINE Integer SIMDCALL setzero_si()
{
    const Integer zero = _mm_setzero_si128();
    return zero;
}
|
||||
|
||||
// *p = a (stores all elements contiguously in memory); forwards to
// _mm_store_ps.
static SIMDINLINE void SIMDCALL store_ps(float* p, Float a)
{
    _mm_store_ps(p, a);
}
|
||||
|
||||
// *p = a; writes through the address of the wrapped vector with
// _mm_store_si128.
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a)
{
    auto* const dest = &p->v;
    _mm_store_si128(dest, a);
}
|
||||
|
||||
// *p = a (same as store_si but allows for unaligned mem)
static SIMDINLINE void SIMDCALL storeu_si(Integer* p, Integer a)
{
    auto* const dest = &p->v;
    _mm_storeu_si128(dest, a);
}
|
||||
|
||||
// *p = a (same as store_ps, but doesn't keep memory in cache); forwards to
// _mm_stream_ps.
static SIMDINLINE void SIMDCALL stream_ps(float* p, Float a)
{
    _mm_stream_ps(p, a);
}
|
||||
|
||||
// Build a Float vector from four scalars, forwarded to _mm_set_ps in the same
// order (in3 first, in0 last).
static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
    const Float packed = _mm_set_ps(in3, in2, in1, in0);
    return packed;
}
|
||||
|
||||
// Build an Integer vector from four scalars, forwarded to _mm_set_epi32 in the
// same order (in3 first, in0 last).
static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
{
    const Integer packed = _mm_set_epi32(in3, in2, in1, in0);
    return packed;
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE float SIMDCALL extract_ps(Float a)
|
||||
{
|
||||
int tmp = _mm_extract_ps(a, ImmT);
|
||||
return *reinterpret_cast<float*>(&tmp);
|
||||
}
|
||||
|
||||
// Expand the low four bits of mask into a vector mask.  Each lane is tested
// against its own bit (0x01, 0x02, 0x04, 0x08 via set_epi32); the lane becomes
// the result of cmplt_epi32(0, mask & bit), viewed as Float.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    const Integer laneBits = set_epi32(0x08, 0x04, 0x02, 0x01);
    const Integer selected = and_si(set1_epi32(mask), laneBits);
    const Integer laneMask = cmplt_epi32(setzero_si(), selected);
    return castsi_ps(laneMask);
}
|
||||
|
||||
// Retire every wrapper macro defined at the top of this file so that nothing
// leaks into whatever is included next.  Listed once each, in definition order.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -1,66 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX2_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (2) implementation
|
||||
//
|
||||
// Since this implementation inherits from the AVX (1) implementation,
|
||||
// the only operations below are ones that replace AVX (1) operations.
|
||||
// Only 2 shifts and 2 gathers were introduced with AVX 2
|
||||
// Also, add native support for FMA operations
|
||||
//============================================================================
|
||||
#define SIMD_WRAPPER_3(op) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
|
||||
|
||||
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
|
||||
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
|
||||
|
||||
// return a << b (uint32), per lane, using the native _mm_sllv_epi32.
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB)
{
    const Integer shifted = _mm_sllv_epi32(vA, vB);
    return shifted;
}
|
||||
|
||||
// return a >> b (uint32), per lane, using the native _mm_srlv_epi32.
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB)
{
    const Integer shifted = _mm_srlv_epi32(vA, vB);
    return shifted;
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
|
||||
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
|
||||
{
|
||||
return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
|
||||
}
|
||||
|
||||
#undef SIMD_WRAPPER_3
|
||||
|
|
@ -1,368 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (512) implementation
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
||||
private:
|
||||
// __conv overload set: moves values between the 128-bit wrapper types and the
// raw 512-bit register types via the _mm512_cast* intrinsics, so the masked
// AVX512 intrinsics can be applied to 128-bit operands.

// 128-bit wrapper --> 512-bit register
static SIMDINLINE __m512 __conv(Float r)
{
    const __m512 wide = _mm512_castps128_ps512(r.v);
    return wide;
}
static SIMDINLINE __m512d __conv(Double r)
{
    const __m512d wide = _mm512_castpd128_pd512(r.v);
    return wide;
}
static SIMDINLINE __m512i __conv(Integer r)
{
    const __m512i wide = _mm512_castsi128_si512(r.v);
    return wide;
}

// 512-bit register --> 128-bit wrapper
static SIMDINLINE Float __conv(__m512 r)
{
    const Float narrow = _mm512_castps512_ps128(r);
    return narrow;
}
static SIMDINLINE Double __conv(__m512d r)
{
    const Double narrow = _mm512_castpd512_pd128(r);
    return narrow;
}
static SIMDINLINE Integer __conv(__m512i r)
{
    const Integer narrow = _mm512_castsi512_si128(r);
    return narrow;
}
|
||||
|
||||
public:
|
||||
// Wrapper-generating macros for the AVX512-backed 128-bit implementation.
// Every generated function widens its operands with __conv, calls the
// zero-masking intrinsic _mm512_maskz_<intrin> with the given lane mask, and
// narrows the result with __conv.  Variants ending in '_' take the intrinsic
// suffix and mask explicitly; the short forms reuse `op` as the suffix and pass
// a mask of 0xf.

// Float op(Float)
#define SIMD_WRAPPER_1_(op, intrin, mask)                                     \
    static SIMDINLINE Float SIMDCALL op(Float a)                              \
    {                                                                         \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));              \
    }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))

// Float op<ImmT>(Float)
#define SIMD_WRAPPER_1I_(op, intrin, mask)                                    \
    template <int ImmT>                                                       \
    static SIMDINLINE Float SIMDCALL op(Float a)                              \
    {                                                                         \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));        \
    }
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))

// Float op(Float, Float)
#define SIMD_WRAPPER_2_(op, intrin, mask)                                     \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
    {                                                                         \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));   \
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))

// Float op<ImmT>(Float, Float)
#define SIMD_WRAPPER_2I(op)                                                   \
    template <int ImmT>                                                       \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
    {                                                                         \
        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));    \
    }

// Float op(Float, Float, Float)
#define SIMD_WRAPPER_3_(op, intrin, mask)                                                 \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                        \
    {                                                                                     \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));    \
    }
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))

// Double op<ImmT>(Double, Double); the mask is 0x3 here
#define SIMD_DWRAPPER_2I(op)                                                  \
    template <int ImmT>                                                       \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)                  \
    {                                                                         \
        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));    \
    }

// Integer op(Integer)
#define SIMD_IWRAPPER_1_(op, intrin, mask)                                    \
    static SIMDINLINE Integer SIMDCALL op(Integer a)                          \
    {                                                                         \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));              \
    }
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))

// Integer op<ImmT>(Integer)
#define SIMD_IWRAPPER_1I_(op, intrin, mask)                                   \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer a)                          \
    {                                                                         \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));        \
    }
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))

// Integer op(Integer, Integer)
#define SIMD_IWRAPPER_2_(op, intrin, mask)                                    \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)               \
    {                                                                         \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));   \
    }
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))

// Integer op<ImmT>(Integer, Integer)
#define SIMD_IWRAPPER_2I(op)                                                  \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)               \
    {                                                                         \
        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));    \
    }
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps); // return a + b
|
||||
SIMD_WRAPPER_2(div_ps); // return a / b
|
||||
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
|
||||
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
|
||||
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
|
||||
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
|
||||
SIMD_WRAPPER_2(mul_ps); // return a * b
|
||||
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a
|
||||
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
|
||||
SIMD_WRAPPER_2(sub_ps); // return a - b
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
|
||||
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
|
||||
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
|
||||
|
||||
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2_32(mullo_epi32);
|
||||
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
|
||||
|
||||
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
|
||||
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
|
||||
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
|
||||
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
|
||||
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
|
||||
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
|
||||
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
|
||||
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
|
||||
|
||||
// use AVX2 version
|
||||
// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations (Use AVX2 versions)
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
|
||||
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
|
||||
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
|
||||
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
|
||||
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations (Use AVX2 versions)
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
|
||||
//
|
||||
// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
|
||||
//{
|
||||
// return cmpgt_epi32(b, a);
|
||||
//}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16
//                                   //                   and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32
//                                   //                   and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16
//                                   //                   and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32
//                                   //                   and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
|
||||
|
||||
// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for
|
||||
// each 32-bit lane i (float)
|
||||
//{
|
||||
// return _mm256_permutevar8x32_ps(a, swiz);
|
||||
//}
|
||||
|
||||
// Immediate-controlled 32-bit shuffle.
SIMD_IWRAPPER_1I_32(shuffle_epi32);

// Disabled here:
// template<int ImmT>
// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
// {
//     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
// }
// SIMD_IWRAPPER_2(shuffle_epi8);

// 32-bit interleaves (high / low).
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
|
||||
|
||||
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
// return *p (loads SIMD width elements from memory)
//
// Implemented with the zero-masking unaligned 512-bit load; the 0xf mask
// enables only the four lowest 32-bit lanes.
static SIMDINLINE Float SIMDCALL load_ps(float const* p)
{
    const __mmask16 lowLanes = __mmask16(0xf);
    return __conv(_mm512_maskz_loadu_ps(lowLanes, p));
}
|
||||
|
||||
// return *p
//
// Integer counterpart of load_ps: zero-masking unaligned load of 32-bit lanes
// with only the four lowest lanes enabled (mask 0xf).
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    const __mmask16 lowLanes = __mmask16(0xf);
    return __conv(_mm512_maskz_loadu_epi32(lowLanes, p));
}
|
||||
|
||||
// return *p (same as load_ps but allows for unaligned mem)
//
// Uses the same masked unaligned-load intrinsic and 0xf lane mask as load_ps.
static SIMDINLINE Float SIMDCALL loadu_ps(float const* p)
{
    const __mmask16 lowLanes = __mmask16(0xf);
    return __conv(_mm512_maskz_loadu_ps(lowLanes, p));
}
|
||||
|
||||
// return *p (same as load_si but allows for unaligned mem)
//
// Uses the same masked unaligned-load intrinsic and 0xf lane mask as load_si.
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const* p)
{
    const __mmask16 lowLanes = __mmask16(0xf);
    return __conv(_mm512_maskz_loadu_epi32(lowLanes, p));
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return __conv(_mm512_mask_i32gather_ps(
|
||||
_mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT)));
|
||||
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
|
||||
{
|
||||
__mmask16 m = 0xf;
|
||||
m = _mm512_mask_test_epi32_mask(
|
||||
m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
|
||||
return __conv(
|
||||
_mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
|
||||
}
|
||||
|
||||
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
// {
|
||||
// __mmask64 m = 0xffffull;
|
||||
// return static_cast<uint32_t>(
|
||||
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
// }
|
||||
|
||||
// Stores the lanes of `src` whose matching 32-bit lane in `mask` has its sign
// bit set; only the four lowest lanes (0xf) are ever considered. The store is
// the unaligned masked variant.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    const __m512i   signBit = _mm512_set1_epi32(0x80000000);
    const __mmask16 selected =
        _mm512_mask_test_epi32_mask(__mmask16(0xf), __conv(mask), signBit);

    _mm512_mask_storeu_ps(p, selected, __conv(src));
}
|
||||
|
||||
// *p = a (stores all elements contiguously in memory)
//
// Masked unaligned store restricted to the four lowest 32-bit lanes (0xf).
static SIMDINLINE void SIMDCALL store_ps(float* p, Float a)
{
    const __mmask16 lowLanes = __mmask16(0xf);
    _mm512_mask_storeu_ps(p, lowLanes, __conv(a));
}
|
||||
|
||||
// *p = a
//
// Masked unaligned store of 32-bit lanes restricted to the four lowest lanes
// (0xf).
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a)
{
    const __mmask16 lowLanes = __mmask16(0xf);
    _mm512_mask_storeu_epi32(p, lowLanes, __conv(a));
}
|
||||
|
||||
// Expands the low four bits of `mask` into a vector: each lane whose bit is
// set is filled with -1 (all bits set) and every other lane is zeroed. The
// integer result is reinterpreted as Float via castsi_ps.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    const __mmask16 laneBits = __mmask16(mask & 0xf);
    const auto      expanded = __conv(_mm512_maskz_set1_epi32(laneBits, -1));
    return castsi_ps(expanded);
}
|
||||
|
||||
//=======================================================================
|
||||
// Legacy interface (available only in SIMD256 width)
|
||||
//=======================================================================
|
||||
|
||||
// Drop the helper macros so they do not leak past this file.

// Float wrappers
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3

// Double wrappers
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I

// Integer wrappers, one operand
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64

// Integer wrappers, one operand plus immediate
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64

// Integer wrappers, two operands
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I

// Left disabled:
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64
|
||||
|
|
@ -1,196 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (512) implementation
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
||||
// Float wrapper generators.
//
// Each macro emits a static function `op` that forwards to the zero-masking
// intrinsic _mm512_maskz_<intrin>, converting arguments and result with
// __conv. The underscore-suffixed forms take the intrinsic name and write mask
// explicitly; the short forms reuse `op` as the intrinsic name and pass
// __mmask16(0xf), which enables the four lowest 32-bit lanes.

// op(a)
#define SIMD_WRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))

// op<ImmT>(a): ImmT is forwarded as the intrinsic's trailing immediate.
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))

// op(a, b)
#define SIMD_WRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))

// op<ImmT>(a, b): the mask is spelled as a typed __mmask16, matching the
// other Float wrappers in this group.
#define SIMD_WRAPPER_2I(op) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return __conv(_mm512_maskz_##op(__mmask16(0xf), __conv(a), __conv(b), ImmT)); \
    }

// op(a, b, c)
#define SIMD_WRAPPER_3_(op, intrin, mask) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
    }
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
|
||||
|
||||
// Double wrapper generators.
//
// Same scheme as the Float wrappers: each macro emits a static function `op`
// forwarding to _mm512_maskz_<intrin> through __conv. The short forms pass
// __mmask8(0x3), which enables the two lowest 64-bit lanes.

// op(a)
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Double SIMDCALL op(Double a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))

// op<ImmT>(a): ImmT is forwarded as the intrinsic's trailing immediate.
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))

// op(a, b)
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))

// op<ImmT>(a, b): the mask is spelled as a typed __mmask8, matching the other
// Double wrappers in this group.
#define SIMD_DWRAPPER_2I(op) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return __conv(_mm512_maskz_##op(__mmask8(0x3), __conv(a), __conv(b), ImmT)); \
    }
|
||||
|
||||
// Integer wrapper generators.
//
// Each macro emits a static function `op` forwarding to _mm512_maskz_<intrin>
// through __conv. The width-suffixed forms choose the write mask for the
// element size:
//   _8  -> __mmask64(0xffffull)  (sixteen 8-bit lanes)
//   _16 -> __mmask32(0xff)       (eight 16-bit lanes)
//   _64 -> __mmask8(0x3)         (two 64-bit lanes)

// op(a)
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))

// op<ImmT>(a): ImmT is forwarded as the intrinsic's trailing immediate.
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))

// op(a, b)
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))

// op<ImmT>(a, b): fixed four-lane mask, spelled as a typed __mmask16 like the
// masks used by the other wrappers.
#define SIMD_IWRAPPER_2I(op) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return __conv(_mm512_maskz_##op(__mmask16(0xf), __conv(a), __conv(b), ImmT)); \
    }
|
||||
|
||||
// 8-bit and 64-bit arithmetic
SIMD_IWRAPPER_2_8(add_epi8);   // a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8);  // ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8);  // (b > a) ? 0 : (a - b) (uint8)

// Narrowing packs. For the exact lane layout see the documentation of the
// matching _mm256_* and _mm512_* intrinsics.
SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8   (_mm256_packs_epi16 / _mm512_packs_epi16)
SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16  (_mm256_packs_epi32 / _mm512_packs_epi32)
SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8 (_mm256_packus_epi16 / _mm512_packus_epi16)
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 (_mm256_packus_epi32 / _mm512_packus_epi32)

// Interleaves for 16-, 64- and 8-bit elements
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
// Builds a bit mask from `a`: bit i of the result is set when byte i has its
// top bit (0x80) set. Only the sixteen lowest bytes are tested (mask 0xffff);
// the 64-bit mask result is narrowed to uint32_t.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    const __mmask64 lowBytes = 0xffffull;
    const __mmask64 hits =
        _mm512_mask_test_epi8_mask(lowBytes, __conv(a), _mm512_set1_epi8(0x80));
    return static_cast<uint32_t>(hits);
}
|
||||
|
||||
// Drop the helper macros defined above so they do not leak past this file.

// Float wrappers
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3

// Double wrappers
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I

// Integer wrappers, one operand
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64

// Integer wrappers, one operand plus immediate
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64

// Integer wrappers, two operands
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I

// Left disabled:
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (512) implementation for Knights Family
|
||||
//
|
||||
// Since this implementation inherits from the AVX512Base implementation,
|
||||
// the only operations below are ones that replace AVX512F / AVX512CD operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
|
@ -1,826 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
using SIMD128T = SIMD128Impl::AVXImpl;
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (1) implementation
|
||||
//============================================================================
|
||||
|
||||
// Float / Double wrapper generators: each emits a static function named after
// the macro argument that forwards straight to the _mm256_ intrinsic of the
// same name.

// Float, one operand.
#define SIMD_WRAPPER_1(fn) \
    static SIMDINLINE Float SIMDCALL fn(Float const& a) \
    { \
        return _mm256_##fn(a); \
    }

// Float, two operands.
#define SIMD_WRAPPER_2(fn) \
    static SIMDINLINE Float SIMDCALL fn(Float const& a, Float const& b) \
    { \
        return _mm256_##fn(a, b); \
    }

// Double, two operands.
#define SIMD_DWRAPPER_2(fn) \
    static SIMDINLINE Double SIMDCALL fn(Double const& a, Double const& b) \
    { \
        return _mm256_##fn(a, b); \
    }

// Float, two operands plus a compile-time immediate (template parameter ImmT).
#define SIMD_WRAPPER_2I(fn) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL fn(Float const& a, Float const& b) \
    { \
        return _mm256_##fn(a, b, ImmT); \
    }

// Double, two operands plus a compile-time immediate (template parameter ImmT).
#define SIMD_DWRAPPER_2I(fn) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL fn(Double const& a, Double const& b) \
    { \
        return _mm256_##fn(a, b, ImmT); \
    }

// Float, three operands.
#define SIMD_WRAPPER_3(fn) \
    static SIMDINLINE Float SIMDCALL fn(Float const& a, Float const& b, Float const& c) \
    { \
        return _mm256_##fn(a, b, c); \
    }
|
||||
|
||||
// Integer wrapper generators.

// One operand: forwards to _mm256_<fn>.
#define SIMD_IWRAPPER_1(fn) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a) \
    { \
        return _mm256_##fn(a); \
    }

// Two operands: forwards to _mm256_<fn>.
#define SIMD_IWRAPPER_2(fn) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a, Integer const& b) \
    { \
        return _mm256_##fn(a, b); \
    }

// Two operands routed through a float intrinsic: both inputs are cast with
// castsi_ps, `floatIntrin` is applied, and the result is cast back with
// castps_si.
#define SIMD_IFWRAPPER_2(fn, floatIntrin) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a, Integer const& b) \
    { \
        return castps_si(floatIntrin(castsi_ps(a), castsi_ps(b))); \
    }

// Same as SIMD_IFWRAPPER_2, with a compile-time immediate ImmT appended.
#define SIMD_IFWRAPPER_2I(fn, floatIntrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a, Integer const& b) \
    { \
        return castps_si(floatIntrin(castsi_ps(a), castsi_ps(b), ImmT)); \
    }

// Two operands plus immediate; `suffix` names the _mm256_ intrinsic to call.
#define SIMD_IWRAPPER_2I_(fn, suffix) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a, Integer const& b) \
    { \
        return _mm256_##suffix(a, b, ImmT); \
    }
#define SIMD_IWRAPPER_2I(fn) SIMD_IWRAPPER_2I_(fn, fn)

// Three operands: forwards to _mm256_<fn>.
#define SIMD_IWRAPPER_3(fn) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a, Integer const& b, Integer const& c) \
    { \
        return _mm256_##fn(a, b, c); \
    }
|
||||
|
||||
// emulated integer simd
//
// Each generator below builds the Integer result from two halves, computing
// each half with the SIMD128T operation of the same name.

// One operand: applies the operation to a.v4[0] and a.v4[1] independently.
#define SIMD_EMU_IWRAPPER_1(fn) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a) \
    { \
        return Integer{ \
            SIMD128T::fn(a.v4[0]), \
            SIMD128T::fn(a.v4[1]), \
        }; \
    }

// One operand, both result halves derived from the same 128-bit input: the
// first half is fn(input), the second is fn(srli_si<byteShift>(input)).
// Overloaded for Integer (reads a.v4[0] only) and for SIMD128Impl::Integer.
#define SIMD_EMU_IWRAPPER_1L(fn, byteShift) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a) \
    { \
        return Integer{ \
            SIMD128T::fn(a.v4[0]), \
            SIMD128T::fn(SIMD128T::template srli_si<byteShift>(a.v4[0])), \
        }; \
    } \
    static SIMDINLINE Integer SIMDCALL fn(SIMD128Impl::Integer const& a) \
    { \
        return Integer{ \
            SIMD128T::fn(a), \
            SIMD128T::fn(SIMD128T::template srli_si<byteShift>(a)), \
        }; \
    }

// One operand plus compile-time immediate ImmT, applied to each half.
#define SIMD_EMU_IWRAPPER_1I(fn) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a) \
    { \
        return Integer{ \
            SIMD128T::template fn<ImmT>(a.v4[0]), \
            SIMD128T::template fn<ImmT>(a.v4[1]), \
        }; \
    }

// Two operands: matching halves of a and b are combined.
#define SIMD_EMU_IWRAPPER_2(fn) \
    static SIMDINLINE Integer SIMDCALL fn(Integer const& a, Integer const& b) \
    { \
        return Integer{ \
            SIMD128T::fn(a.v4[0], b.v4[0]), \
            SIMD128T::fn(a.v4[1], b.v4[1]), \
        }; \
    }
|
||||
|
||||
// Emulated two-operand integer operation with a compile-time immediate.
//
// The generated op<ImmT>(a, b) applies SIMD128T::op<ImmT> to the matching
// halves of a and b (v4[0] with v4[0], v4[1] with v4[1]) and packs the two
// results into an Integer. Both operands are read through `.v4`, the same
// member SIMD_EMU_IWRAPPER_2 uses for its per-half operands.
#define SIMD_EMU_IWRAPPER_2I(op) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    { \
        return Integer{ \
            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]), \
            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]), \
        }; \
    }
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps); // a + b
SIMD_WRAPPER_2(div_ps); // a / b
|
||||
|
||||
// return (a * b) + c
//
// Computed as a separate multiply followed by an add (mul_ps then add_ps).
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a, Float const& b, Float const& c)
{
    const Float product = mul_ps(a, b);
    return add_ps(product, c);
}
|
||||
|
||||
// return (a * b) - c
//
// Computed as a separate multiply followed by a subtract (mul_ps then sub_ps).
static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a, Float const& b, Float const& c)
{
    const Float product = mul_ps(a, b);
    return sub_ps(product, c);
}
|
||||
|
||||
SIMD_WRAPPER_2(max_ps);   // (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // a * b
SIMD_WRAPPER_1(rcp_ps);   // 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // a - b
|
||||
|
||||
template <RoundMode RMT>
|
||||
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
|
||||
{
|
||||
return _mm256_round_ps(a, static_cast<int>(RMT));
|
||||
}
|
||||
|
||||
// round_ps specialised with RoundMode::CEIL_NOEXC.
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
    const Float rounded = round_ps<RoundMode::CEIL_NOEXC>(a);
    return rounded;
}
|
||||
// round_ps specialised with RoundMode::FLOOR_NOEXC.
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
    const Float rounded = round_ps<RoundMode::FLOOR_NOEXC>(a);
    return rounded;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
// All of these are emulated on two 128-bit halves via SIMD128T.
SIMD_EMU_IWRAPPER_1(abs_epi32); // absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8);  // a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // a * b (int32)

// mullo_epi32: (a * b) & 0xFFFFFFFF
//
// The packed 32-bit integers in a and b are multiplied into intermediate
// 64-bit products; only the low 32 bits of each product are kept.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
// Each bitwise operation comes as a Float flavour and an Integer flavour; the
// Integer one is routed through the matching float intrinsic.
SIMD_WRAPPER_2(and_ps);                        // a & b (float treated as int)
SIMD_IFWRAPPER_2(and_si, _mm256_and_ps);       // a & b (int)
SIMD_WRAPPER_2(andnot_ps);                     // (~a) & b (float treated as int)
SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // (~a) & b (int)
SIMD_WRAPPER_2(or_ps);                         // a | b (float treated as int)
SIMD_IFWRAPPER_2(or_si, _mm256_or_ps);         // a | b (int)
SIMD_WRAPPER_2(xor_ps);                        // a ^ b (float treated as int)
SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps);       // a ^ b (int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // a << ImmT
|
||||
|
||||
// return a << b (uint32)
//
// Per-lane variable left shift over the eight 32-bit lanes of vA, using the
// matching lane of vCount as the shift amount. No variable-shift intrinsic is
// used here, so each lane is shifted as a scalar.
//
// Lanes are handled as unsigned values and a count of 32 or more produces 0.
// This avoids the undefined behaviour of shifting a signed value or shifting
// by at least the operand width.
// NOTE(review): the zero result for counts >= 32 is intended to match the
// hardware variable shift (_mm256_sllv_epi32) -- confirm against callers.
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA, Integer const& vCount)
{
    // Spill both operands so individual lanes can be addressed with a runtime
    // index (the extract/insert intrinsics need compile-time lane numbers).
    alignas(32) uint32_t lanes[8];
    alignas(32) uint32_t counts[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), vA);
    _mm256_store_si256(reinterpret_cast<__m256i*>(counts), vCount);

    for (int i = 0; i < 8; ++i)
    {
        lanes[i] = (counts[i] < 32u) ? (lanes[i] << counts[i]) : 0u;
    }

    return _mm256_load_si256(reinterpret_cast<__m256i const*>(lanes));
}
|
||||
|
||||
SIMD_EMU_IWRAPPER_1I(srai_epi32); // a >> ImmT (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // a >> ImmT (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si);    // a >> (ImmT*8) (uint)
|
||||
|
||||
template <int ImmT> // same as srli_si, but with Float cast to int
|
||||
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
|
||||
{
|
||||
return castsi_ps(srli_si<ImmT>(castps_si(a)));
|
||||
}
|
||||
|
||||
// Per-lane variable logical right shift: return a >> b (uint32).
//
// AVX(1) has no per-lane variable shift instruction, so every 32-bit lane is
// shifted in scalar code. Lanes are treated as uint32_t so that zeros (not
// copies of the sign bit) are shifted in, and a count above 31 produces 0
// instead of executing an undefined C++ shift. Both points match the native
// _mm256_srlv_epi32 that the AVX2 implementation forwards srlv_epi32 to.
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
                                              Integer const& vCount) // return a >> b (uint32)
{
    constexpr uint32_t kLaneCount = 8;  // 32-bit lanes in a 256-bit register
    constexpr uint32_t kLaneBits  = 32; // width of one lane in bits

    // Spill both operands so the lanes can be addressed with a runtime index.
    uint32_t values[kLaneCount];
    uint32_t counts[kLaneCount];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(values), vA);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);

    for (uint32_t lane = 0; lane < kLaneCount; ++lane)
    {
        uint32_t const count = counts[lane];
        values[lane]         = (count < kLaneBits) ? (values[lane] >> count) : 0u;
    }

    return _mm256_loadu_si256(reinterpret_cast<__m256i const*>(values));
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations
|
||||
//-----------------------------------------------------------------------
|
||||
// Reinterpret a Double register as Float: return *(Float*)(&a)
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a)
{
    Float const bits = _mm256_castpd_ps(a);
    return bits;
}
|
||||
|
||||
// Reinterpret a Float register as Integer: return *(Integer*)(&a)
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a)
{
    Integer const bits = _mm256_castps_si256(a);
    return bits;
}
|
||||
|
||||
// Reinterpret an Integer register as Double: return *(Double*)(&a)
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a)
{
    Double const bits = _mm256_castsi256_pd(a);
    return bits;
}
|
||||
|
||||
// Reinterpret a Float register as Double: return *(Double*)(&a)
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a)
{
    Double const bits = _mm256_castps_pd(a);
    return bits;
}
|
||||
|
||||
// Reinterpret a Double register as Integer: return *(Integer*)(&a)
static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a)
{
    Integer const bits = _mm256_castpd_si256(a);
    return bits;
}
|
||||
|
||||
// Reinterpret an Integer register as Float: return *(Float*)(&a)
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a)
{
    Float const bits = _mm256_castsi256_ps(a);
    return bits;
}
|
||||
|
||||
// Value conversion of every lane: return (float)a (int32 --> float)
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const& a)
{
    Float const asFloat = _mm256_cvtepi32_ps(a);
    return asFloat;
}
|
||||
|
||||
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16)
|
||||
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32)
|
||||
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
|
||||
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
|
||||
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
|
||||
|
||||
// Value conversion of every lane: return (int32)a (float --> int32)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const& a)
{
    Integer const asInt = _mm256_cvtps_epi32(a);
    return asInt;
}
|
||||
|
||||
// Truncating value conversion: return (int32)a (rnd_to_zero(float) --> int32)
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const& a)
{
    Integer const truncated = _mm256_cvttps_epi32(a);
    return truncated;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <CompareType CmpTypeT>
|
||||
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
|
||||
{
|
||||
return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
|
||||
}
|
||||
// return a < b (float), via cmp_ps with the LT_OQ predicate
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
    Float const laneMask = cmp_ps<CompareType::LT_OQ>(a, b);
    return laneMask;
}
|
||||
// return a > b (float), via cmp_ps with the GT_OQ predicate
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
    Float const laneMask = cmp_ps<CompareType::GT_OQ>(a, b);
    return laneMask;
}
|
||||
// return a != b (float), via cmp_ps with the NEQ_OQ predicate
// NOTE(review): NEQ_OQ is presumably the ordered variant (NaN operands compare
// false), unlike the unordered SSE cmpneq - confirm this is intended.
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
    Float const laneMask = cmp_ps<CompareType::NEQ_OQ>(a, b);
    return laneMask;
}
|
||||
// return a == b (float), via cmp_ps with the EQ_OQ predicate
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
    Float const laneMask = cmp_ps<CompareType::EQ_OQ>(a, b);
    return laneMask;
}
|
||||
// return a >= b (float), via cmp_ps with the GE_OQ predicate
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
    Float const laneMask = cmp_ps<CompareType::GE_OQ>(a, b);
    return laneMask;
}
|
||||
// return a <= b (float), via cmp_ps with the LE_OQ predicate
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
    Float const laneMask = cmp_ps<CompareType::LE_OQ>(a, b);
    return laneMask;
}
|
||||
|
||||
SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
|
||||
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
|
||||
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
|
||||
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
|
||||
SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
|
||||
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
|
||||
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
|
||||
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
|
||||
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (float)
static SIMDINLINE bool SIMDCALL testz_ps(Float const& a, Float const& b)
{
    // The intrinsic reports its result as an int flag; expose it as a bool.
    int const zeroFlag = _mm256_testz_ps(a, b);
    return zeroFlag != 0;
}
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (int)
static SIMDINLINE bool SIMDCALL testz_si(Integer const& a, Integer const& b)
{
    // The intrinsic reports its result as an int flag; expose it as a bool.
    int const zeroFlag = _mm256_testz_si256(a, b);
    return zeroFlag != 0;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
|
||||
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
|
||||
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
|
||||
|
||||
// Integer blend driven by a Float mask: return mask ? b : a (int).
// Implemented on top of blendv_ps by reinterpreting both integer operands as Float.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
                                                Integer const& b,
                                                Float const&   mask)
{
    Float const whenClear = castsi_ps(a);
    Float const whenSet   = castsi_ps(b);
    Float const blended   = blendv_ps(whenClear, whenSet, mask);
    return castps_si(blended);
}
|
||||
|
||||
// Integer blend driven by an Integer mask: return mask ? b : a (int).
// All three operands are reinterpreted as Float so blendv_ps can do the selection.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
                                                Integer const& b,
                                                Integer const& mask)
{
    Float const whenClear = castsi_ps(a);
    Float const whenSet   = castsi_ps(b);
    Float const selector  = castsi_ps(mask);
    return castps_si(blendv_ps(whenClear, whenSet, selector));
}
|
||||
|
||||
// return *p (all elements in vector get same value)
static SIMDINLINE Float SIMDCALL broadcast_ss(float const* p)
{
    Float const splat = _mm256_broadcast_ss(p);
    return splat;
}
|
||||
|
||||
SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_EMU_IWRAPPER_2(
|
||||
packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_EMU_IWRAPPER_2(
|
||||
packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
|
||||
{
|
||||
return _mm256_permute_ps(a, ImmT);
|
||||
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (int32)
//
// Scalar emulation of the cross-lane permute, which AVX(1) lacks.
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const& a, Integer const& swiz)
{
    Integer result;

    // Ugly slow implementation
    uint32_t const* pA      = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
    uint32_t*       pResult = reinterpret_cast<uint32_t*>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // A 256-bit register holds eight 32-bit lanes, so only the low three
        // selector bits may be used. A wider mask would let selectors 8..15
        // read past the end of `a`. This also matches
        // _mm256_permutevar8x32_epi32, which the AVX2 implementation uses.
        pResult[i] = pA[0x7 & pSwiz[i]];
    }

    return result;
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (float)
//
// Scalar emulation of the cross-lane permute, which AVX(1) lacks.
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a, Integer const& swiz)
{
    Float result;

    // Ugly slow implementation
    float const*    pA      = reinterpret_cast<float const*>(&a);
    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
    float*          pResult = reinterpret_cast<float*>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // A 256-bit register holds eight 32-bit lanes, so only the low three
        // selector bits may be used. A wider mask would let selectors 8..15
        // read past the end of `a`. This also matches
        // _mm256_permutevar8x32_ps, which the AVX2 implementation uses.
        pResult[i] = pA[0x7 & pSwiz[i]];
    }

    return result;
}
|
||||
|
||||
SIMD_WRAPPER_2I(permute2f128_ps);
|
||||
SIMD_DWRAPPER_2I(permute2f128_pd);
|
||||
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
|
||||
|
||||
SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
|
||||
{
|
||||
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
|
||||
}
|
||||
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
|
||||
SIMD_DWRAPPER_2I(shuffle_pd);
|
||||
SIMD_WRAPPER_2I(shuffle_ps);
|
||||
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
|
||||
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
|
||||
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
|
||||
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
|
||||
SIMD_DWRAPPER_2(unpackhi_pd);
|
||||
SIMD_WRAPPER_2(unpackhi_ps);
|
||||
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
|
||||
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
|
||||
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
|
||||
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
|
||||
SIMD_DWRAPPER_2(unpacklo_pd);
|
||||
SIMD_WRAPPER_2(unpacklo_ps);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
uint32_t* pOffsets = (uint32_t*)&idx;
|
||||
Float vResult;
|
||||
float* pResult = (float*)&vResult;
|
||||
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
|
||||
{
|
||||
uint32_t offset = pOffsets[i];
|
||||
offset = offset * static_cast<uint32_t>(ScaleT);
|
||||
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
|
||||
}
|
||||
|
||||
return vResult;
|
||||
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return i32gather_ps<ScaleT>(p, idx);
|
||||
}
|
||||
|
||||
// return *p (broadcast 1 value to all elements); alias of broadcast_ss
static SIMDINLINE Float SIMDCALL load1_ps(float const* p)
{
    Float const splat = broadcast_ss(p);
    return splat;
}
|
||||
|
||||
// return *p (loads SIMD width elements from memory).
// See loadu_ps for the variant that accepts unaligned memory.
static SIMDINLINE Float SIMDCALL load_ps(float const* p)
{
    Float const loaded = _mm256_load_ps(p);
    return loaded;
}
|
||||
|
||||
// return *p
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    auto const* pSrc = &p->v; // raw register member inside the Integer wrapper
    return _mm256_load_si256(pSrc);
}
|
||||
|
||||
// return *p (same as load_ps but allows for unaligned mem)
static SIMDINLINE Float SIMDCALL loadu_ps(float const* p)
{
    Float const loaded = _mm256_loadu_ps(p);
    return loaded;
}
|
||||
|
||||
// return *p (same as load_si but allows for unaligned mem)
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const* p)
{
    auto const* pSrc = &p->v; // raw register member inside the Integer wrapper
    return _mm256_lddqu_si256(pSrc);
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
|
||||
{
|
||||
uint32_t* pOffsets = (uint32_t*)&idx;
|
||||
Float vResult = old;
|
||||
float* pResult = (float*)&vResult;
|
||||
unsigned long index = 0;
|
||||
uint32_t umask = movemask_ps(mask);
|
||||
while (_BitScanForward(&index, umask))
|
||||
{
|
||||
umask &= ~(1 << index);
|
||||
uint32_t offset = pOffsets[index];
|
||||
offset = offset * static_cast<uint32_t>(ScaleT);
|
||||
pResult[index] = *(float const*)(((uint8_t const*)p + offset));
|
||||
}
|
||||
|
||||
return vResult;
|
||||
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
|
||||
{
|
||||
return mask_i32gather_ps<ScaleT>(old, p, idx, mask);
|
||||
}
|
||||
|
||||
// Masked store of src to p; lane selection is delegated to _mm256_maskstore_ps.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
    __m256i const laneMask = mask;
    __m256 const  value    = src;
    _mm256_maskstore_ps(p, laneMask, value);
}
|
||||
|
||||
// Byte mask of the 256-bit register, assembled from the two 128-bit halves:
// the mask of a.v4[0] fills the low 16 bits, the mask of a.v4[1] the next 16.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
    auto const lowHalf  = SIMD128T::movemask_epi8(a.v4[0]);
    auto const highHalf = SIMD128T::movemask_epi8(a.v4[1]);
    return lowHalf | (highHalf << 16);
}
|
||||
|
||||
// Result of _mm256_movemask_pd, returned as an unsigned bit mask.
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
    int const bits = _mm256_movemask_pd(a);
    return static_cast<uint32_t>(bits);
}
|
||||
// Result of _mm256_movemask_ps, returned as an unsigned bit mask.
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
    int const bits = _mm256_movemask_ps(a);
    return static_cast<uint32_t>(bits);
}
|
||||
|
||||
// return i (all elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
{
    Integer const splat = _mm256_set1_epi32(i);
    return splat;
}
|
||||
|
||||
// return i (all elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
{
    Integer const splat = _mm256_set1_epi8(i);
    return splat;
}
|
||||
|
||||
// return f (all elements are same value)
static SIMDINLINE Float SIMDCALL set1_ps(float f)
{
    Float const splat = _mm256_set1_ps(f);
    return splat;
}
|
||||
|
||||
// return 0 (float)
static SIMDINLINE Float SIMDCALL setzero_ps()
{
    Float const zero = _mm256_setzero_ps();
    return zero;
}
|
||||
|
||||
// return 0 (integer)
static SIMDINLINE Integer SIMDCALL setzero_si()
{
    Integer const zero = _mm256_setzero_si256();
    return zero;
}
|
||||
|
||||
// *p = a (stores all elements contiguously in memory)
static SIMDINLINE void SIMDCALL store_ps(float* p, Float const& a)
{
    __m256 const value = a;
    _mm256_store_ps(p, value);
}
|
||||
|
||||
// *p = a
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a)
{
    auto* const pDst = &p->v; // raw register member inside the Integer wrapper
    _mm256_store_si256(pDst, a);
}
|
||||
|
||||
// *p = a (same as store_ps, but doesn't keep memory in cache)
static SIMDINLINE void SIMDCALL stream_ps(float* p, Float const& a)
{
    __m256 const value = a;
    _mm256_stream_ps(p, value);
}
|
||||
|
||||
//=======================================================================
|
||||
// Legacy interface (available only in SIMD256 width)
|
||||
//=======================================================================
|
||||
|
||||
// Loads the 128-bit float vector at p and passes it to _mm256_broadcast_ps.
static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
{
    auto const* pSrc = &p->v; // raw register member inside the 128-bit wrapper
    return _mm256_broadcast_ps(pSrc);
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
|
||||
{
|
||||
return _mm256_extractf128_pd(a, ImmT);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
|
||||
{
|
||||
return _mm256_extractf128_ps(a, ImmT);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
|
||||
{
|
||||
return _mm256_extractf128_si256(a, ImmT);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
|
||||
{
|
||||
return _mm256_insertf128_pd(a, b, ImmT);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
|
||||
{
|
||||
return _mm256_insertf128_ps(a, b, ImmT);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
|
||||
{
|
||||
return _mm256_insertf128_si256(a, b, ImmT);
|
||||
}
|
||||
|
||||
// Fallback used when _mm256_set_m128i is not already available as a macro.
// Both arguments are 128-bit integer vectors (SIMD128Impl::Integer): `lo` is
// widened to 256 bits and `hi` is then inserted as the upper half (index 1).
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi, lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif
|
||||
|
||||
// Fallback used when _mm256_loadu2_m128i is not already available as a macro.
// Both arguments are pointers to 128-bit integer vectors
// (SIMD128Impl::Integer const*); each is loaded with _mm_loadu_si128 and the
// two halves are combined with _mm256_set_m128i.
#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(hiaddr, loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
|
||||
|
||||
// Builds a 256-bit integer register from two separately addressed 128-bit
// vectors: *phi supplies the high half and *plo the low half.
static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
                                             SIMD128Impl::Integer const* plo)
{
    auto const* pHigh = &phi->v;
    auto const* pLow  = &plo->v;
    return _mm256_loadu2_m128i(pHigh, pLow);
}
|
||||
|
||||
// Builds an Integer register from eight int32 values, forwarded to
// _mm256_set_epi32 in the same order (i7 first, i0 last).
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    Integer const packed = _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
    return packed;
}
|
||||
|
||||
// Builds a Float register from eight float values, forwarded to
// _mm256_set_ps in the same order (i7 first, i0 last).
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    Float const packed = _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
    return packed;
}
|
||||
|
||||
// Splits src into its two 128-bit halves and stores them to two separately
// addressed destinations through _mm256_storeu2_m128i (phi first, plo second).
static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
                                           SIMD128Impl::Integer* plo,
                                           Integer const&        src)
{
    auto* const pHigh = &phi->v;
    auto* const pLow  = &plo->v;
    _mm256_storeu2_m128i(pHigh, pLow, src);
}
|
||||
|
||||
// Expands a scalar bit mask into a per-lane Float mask.
// Each lane isolates one bit of `mask` (constants 0x01 .. 0x80) and the lane
// result is the outcome of the signed compare 0 < (mask & bit).
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    Integer const laneBit  = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    Integer const isolated = and_si(set1_epi32(mask), laneBit);
    Integer const laneMask = cmplt_epi32(setzero_si(), isolated);
    return castsi_ps(laneMask);
}
|
||||
|
||||
#undef SIMD_WRAPPER_1
|
||||
#undef SIMD_WRAPPER_2
|
||||
#undef SIMD_DWRAPPER_2
|
||||
#undef SIMD_DWRAPPER_2I
|
||||
#undef SIMD_WRAPPER_2I
|
||||
#undef SIMD_WRAPPER_3
|
||||
#undef SIMD_IWRAPPER_1
|
||||
#undef SIMD_IWRAPPER_2
|
||||
#undef SIMD_IFWRAPPER_2
|
||||
#undef SIMD_IFWRAPPER_2I
|
||||
#undef SIMD_IWRAPPER_2I
|
||||
#undef SIMD_IWRAPPER_2I_
|
||||
#undef SIMD_IWRAPPER_2_
|
||||
#undef SIMD_IWRAPPER_3
|
||||
#undef SIMD_EMU_IWRAPPER_1
|
||||
#undef SIMD_EMU_IWRAPPER_1I
|
||||
#undef SIMD_EMU_IWRAPPER_2
|
||||
#undef SIMD_EMU_IWRAPPER_2I
|
||||
|
|
@ -1,255 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX2_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (2) implementation
|
||||
//
|
||||
// Since this implementation inherits from the AVX (1) implementation,
|
||||
// the only operations below are ones that replace AVX (1) operations.
|
||||
// Mostly these are integer operations that are no longer emulated with SSE
|
||||
//============================================================================
|
||||
|
||||
// Generates op(a): a one-operand integer wrapper forwarding to _mm256_<op>(a).
#define SIMD_IWRAPPER_1(op)                                 \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
    {                                                       \
        return _mm256_##op(a);                              \
    }
|
||||
|
||||
// Generates op(a) for intrinsics that consume a 128-bit source: the operand is
// narrowed with _mm256_castsi256_si128 before _mm256_<op> is applied.
#define SIMD_IWRAPPER_1L(op)                                                          \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a)                           \
    { return _mm256_##op(_mm256_castsi256_si128(a)); }
|
||||
|
||||
// Generates op<ImmT>(a): a one-operand integer wrapper whose immediate is a
// template argument, forwarding to _mm256_<op>(a, ImmT).
#define SIMD_IWRAPPER_1I(op)                                                          \
    template <int ImmT>                                                               \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a)                           \
    { return _mm256_##op(a, ImmT); }
|
||||
|
||||
// Same as SIMD_IWRAPPER_1I, for the case where the wrapper name (op) differs
// from the intrinsic suffix (intrin): forwards to _mm256_<intrin>(a, ImmT).
#define SIMD_IWRAPPER_1I_(op, intrin)                                                 \
    template <int ImmT>                                                               \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a)                           \
    { return _mm256_##intrin(a, ImmT); }
|
||||
|
||||
// Generates op(a, b) forwarding to _mm256_<intrin>(a, b), for the case where
// the wrapper name (op) differs from the intrinsic suffix (intrin).
#define SIMD_IWRAPPER_2_(op, intrin)                                                  \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)         \
    { return _mm256_##intrin(a, b); }
|
||||
|
||||
// Generates op(a, b): a two-operand integer wrapper forwarding to _mm256_<op>(a, b).
#define SIMD_IWRAPPER_2(op)                                                           \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)         \
    { return _mm256_##op(a, b); }
|
||||
|
||||
// Generates op<ImmT>(a, b): a two-operand integer wrapper whose immediate is a
// template argument, forwarding to _mm256_<op>(a, b, ImmT).
// (Defined once; an identical second copy of this definition was redundant.)
#define SIMD_IWRAPPER_2I(op)                                                  \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return _mm256_##op(a, b, ImmT);                                       \
    }
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
// Multiply-add via _mm256_fmadd_ps: return (a * b) + c
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a, Float const& b, Float const& c)
{
    Float const fused = _mm256_fmadd_ps(a, b, c);
    return fused;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
|
||||
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
|
||||
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2(mul_epi32); // return (int64)a * (int64)b, using the low int32 of each 64-bit element (int32 --> int64)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2(mullo_epi32);
|
||||
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
|
||||
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
|
||||
// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
|
||||
// Using and_ps instead inhibits the compiler's constant folding and actually issues
|
||||
// the and intrinsic even though both inputs are constant values.
|
||||
#else
|
||||
// Use native integer and intrinsic
|
||||
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
|
||||
#endif
|
||||
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
|
||||
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
|
||||
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
|
||||
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
|
||||
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
|
||||
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
|
||||
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
|
||||
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
|
||||
|
||||
template <int ImmT> // same as srli_si, but with Float cast to int
|
||||
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
|
||||
{
|
||||
return castsi_ps(srli_si<ImmT>(castps_si(a)));
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
|
||||
SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
|
||||
SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
|
||||
SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
|
||||
SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
|
||||
|
||||
// return a < b (int32), expressed as b > a through cmpgt_epi32 with swapped operands
static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a, Integer const& b)
{
    Integer const laneMask = cmpgt_epi32(b, a);
    return laneMask;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
|
||||
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
|
||||
{
|
||||
return _mm256_permute_ps(a, ImmT);
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (float), using the native
// _mm256_permutevar8x32_ps instruction
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a, Integer const& swiz)
{
    Float const permuted = _mm256_permutevar8x32_ps(a, swiz);
    return permuted;
}
|
||||
|
||||
SIMD_IWRAPPER_1I(shuffle_epi32);
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
|
||||
{
|
||||
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
|
||||
}
|
||||
SIMD_IWRAPPER_2(shuffle_epi8);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi16);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi32);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi64);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi8);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi16);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi32);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi64);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi8);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
|
||||
}
|
||||
|
||||
#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
|
||||
// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
|
||||
// correctly in early versions of MSVC 2019
|
||||
#else
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
|
||||
{
|
||||
// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
|
||||
// Only for this intrinsic - not sure why. :(
|
||||
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Result of the native _mm256_movemask_epi8, returned as an unsigned bit mask.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
    int const bits = _mm256_movemask_epi8(a);
    return static_cast<uint32_t>(bits);
}
|
||||
|
||||
//=======================================================================
|
||||
// Legacy interface (available only in SIMD256 width)
|
||||
//=======================================================================
|
||||
|
||||
// Retire the wrapper-generator macros defined at the top of this file so they
// do not leak into whatever is included next.
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1L
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -1,349 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (512) implementation
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are the ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
||||
private:
|
||||
// __conv() overloads: reinterpret between the 256-bit wrapper types and the
// raw 512-bit vector types using the cast intrinsics (no arithmetic is done).

// 256-bit wrapper -> 512-bit raw vector
static SIMDINLINE __m512 __conv(Float v256)
{
    return _mm512_castps256_ps512(v256.v);
}

static SIMDINLINE __m512d __conv(Double v256)
{
    return _mm512_castpd256_pd512(v256.v);
}

static SIMDINLINE __m512i __conv(Integer v256)
{
    return _mm512_castsi256_si512(v256.v);
}

// 512-bit raw vector -> 256-bit wrapper (takes the low 256 bits)
static SIMDINLINE Float __conv(__m512 v512)
{
    return _mm512_castps512_ps256(v512);
}

static SIMDINLINE Double __conv(__m512d v512)
{
    return _mm512_castpd512_pd256(v512);
}

static SIMDINLINE Integer __conv(__m512i v512)
{
    return _mm512_castsi512_si256(v512);
}
|
||||
|
||||
public:
|
||||
// Helper macros generating Float/Double wrappers. Each generated function
// widens its operands with __conv(), invokes the zero-masked 512-bit intrinsic
// under the given write mask, and converts the result back with __conv().

// Float op(Float)
#define SIMD_WRAPPER_1_(op, intrin, kmask)                 \
    static SIMDINLINE Float SIMDCALL op(Float a)           \
    {                                                      \
        __m512 const wa = __conv(a);                       \
        return __conv(_mm512_maskz_##intrin((kmask), wa)); \
    }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))

// Float op<ImmT>(Float)
#define SIMD_WRAPPER_1I_(op, intrin, kmask)                      \
    template <int ImmT>                                          \
    static SIMDINLINE Float SIMDCALL op(Float a)                 \
    {                                                            \
        __m512 const wa = __conv(a);                             \
        return __conv(_mm512_maskz_##intrin((kmask), wa, ImmT)); \
    }
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))

// Float op(Float, Float)
#define SIMD_WRAPPER_2_(op, intrin, kmask)                     \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)      \
    {                                                          \
        __m512 const wa = __conv(a);                           \
        __m512 const wb = __conv(b);                           \
        return __conv(_mm512_maskz_##intrin((kmask), wa, wb)); \
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))

// Float op<ImmT>(Float, Float)
#define SIMD_WRAPPER_2I(op)                                               \
    template <int ImmT>                                                   \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                 \
    {                                                                     \
        __m512 const wa = __conv(a);                                      \
        __m512 const wb = __conv(b);                                      \
        return __conv(_mm512_maskz_##op(__mmask16(0xff), wa, wb, ImmT));  \
    }

// Float op(Float, Float, Float)
#define SIMD_WRAPPER_3_(op, intrin, kmask)                         \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {                                                              \
        __m512 const wa = __conv(a);                               \
        __m512 const wb = __conv(b);                               \
        __m512 const wc = __conv(c);                               \
        return __conv(_mm512_maskz_##intrin((kmask), wa, wb, wc)); \
    }
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))

// Double op<ImmT>(Double, Double)
#define SIMD_DWRAPPER_2I(op)                                           \
    template <int ImmT>                                                \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)           \
    {                                                                  \
        __m512d const wa = __conv(a);                                  \
        __m512d const wb = __conv(b);                                  \
        return __conv(_mm512_maskz_##op(__mmask8(0xf), wa, wb, ImmT)); \
    }
|
||||
|
||||
// Helper macros generating Integer wrappers; same widen / zero-masked op /
// narrow scheme as the Float wrappers, using the low eight 32-bit lanes.

// Integer op(Integer)
#define SIMD_IWRAPPER_1_(op, intrin, kmask)                \
    static SIMDINLINE Integer SIMDCALL op(Integer a)       \
    {                                                      \
        __m512i const wa = __conv(a);                      \
        return __conv(_mm512_maskz_##intrin((kmask), wa)); \
    }
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))

// Integer op<ImmT>(Integer)
#define SIMD_IWRAPPER_1I_(op, intrin, kmask)                     \
    template <int ImmT>                                          \
    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
    {                                                            \
        __m512i const wa = __conv(a);                            \
        return __conv(_mm512_maskz_##intrin((kmask), wa, ImmT)); \
    }
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))

// Integer op(Integer, Integer)
#define SIMD_IWRAPPER_2_(op, intrin, kmask)                     \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        __m512i const wa = __conv(a);                           \
        __m512i const wb = __conv(b);                           \
        return __conv(_mm512_maskz_##intrin((kmask), wa, wb));  \
    }
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))

// Integer op<ImmT>(Integer, Integer)
#define SIMD_IWRAPPER_2I(op)                                             \
    template <int ImmT>                                                  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)          \
    {                                                                    \
        __m512i const wa = __conv(a);                                    \
        __m512i const wb = __conv(b);                                    \
        return __conv(_mm512_maskz_##op(__mmask16(0xff), wa, wb, ImmT)); \
    }
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps);                                 // a + b
SIMD_WRAPPER_2(div_ps);                                 // a / b
SIMD_WRAPPER_3(fmadd_ps);                               // (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);                               // (a * b) - c
SIMD_WRAPPER_2(max_ps);                                 // (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);                                 // (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);                                 // a * b
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff));     // 1.0f / a       (via rcp14)
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // 1.0f / sqrt(a) (via rsqrt14)
SIMD_WRAPPER_2(sub_ps);                                 // a - b
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1_32(abs_epi32); // absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // (a < b) ? a : b (uint32)
// a * b (int32).
// NOTE(review): Intel documents the mul_epi32 family as multiplying the low
// signed 32 bits of each 64-bit element into a 64-bit product - confirm that
// callers expect this rather than a per-lane 32-bit multiply (see mullo_epi32).
SIMD_IWRAPPER_2_32(mul_epi32);
|
||||
|
||||
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2_32(mullo_epi32);
|
||||
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
|
||||
|
||||
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
// Bitwise operations, implemented with the 32-bit-lane masked intrinsics.
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff));       // a & b    (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff));         // a | b    (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff));       // a ^ b    (int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1I_32(slli_epi32); // a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32);  // a << b    (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32);  // a >> b    (uint32)
|
||||
|
||||
// use AVX2 version
|
||||
// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations (Use AVX2 versions)
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
|
||||
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
|
||||
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
|
||||
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
|
||||
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations (Use AVX2 versions)
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
|
||||
//
|
||||
// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
|
||||
//{
|
||||
// return cmpgt_epi32(b, a);
|
||||
//}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (float)
// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
|
||||
//{
|
||||
// return _mm256_permutevar8x32_ps(a, swiz);
|
||||
//}
|
||||
|
||||
SIMD_IWRAPPER_1I_32(shuffle_epi32); // Integer shuffle_epi32<ImmT>(Integer a)
|
||||
// template<int ImmT>
|
||||
// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
|
||||
//{
|
||||
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
|
||||
//}
|
||||
// SIMD_IWRAPPER_2(shuffle_epi8);
|
||||
// 32-bit unpack operations (other element widths are left commented out below).
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
|
||||
|
||||
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
// return *p (loads SIMD width elements from memory)
// Implemented as a zero-masked unaligned 512-bit load limited to lanes 0-7.
static SIMDINLINE Float SIMDCALL load_ps(float const* p)
{
    __mmask16 const low8 = __mmask16(0xff);
    return __conv(_mm512_maskz_loadu_ps(low8, p));
}
|
||||
|
||||
// return *p
// Implemented as a zero-masked unaligned 512-bit load limited to the low
// eight 32-bit lanes.
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    __mmask16 const low8 = __mmask16(0xff);
    return __conv(_mm512_maskz_loadu_epi32(low8, p));
}
|
||||
|
||||
// return *p (same as load_ps but allows for unaligned mem)
static SIMDINLINE Float SIMDCALL loadu_ps(float const* p)
{
    __mmask16 const low8 = __mmask16(0xff);
    return __conv(_mm512_maskz_loadu_ps(low8, p));
}
|
||||
|
||||
// return *p (same as load_si but allows for unaligned mem)
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const* p)
{
    __mmask16 const low8 = __mmask16(0xff);
    return __conv(_mm512_maskz_loadu_epi32(low8, p));
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return __conv(_mm512_mask_i32gather_ps(
|
||||
_mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT)));
|
||||
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
|
||||
{
|
||||
__mmask16 m = 0xff;
|
||||
m = _mm512_mask_test_epi32_mask(
|
||||
m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
|
||||
return __conv(
|
||||
_mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
|
||||
}
|
||||
|
||||
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
// {
|
||||
// __mmask64 m = 0xffffffffull;
|
||||
// return static_cast<uint32_t>(
|
||||
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
// }
|
||||
|
||||
// Stores src lanes to p for every one of the low eight lanes whose mask
// element has its sign bit (0x80000000) set.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    __mmask16 const low8    = 0xff;
    __m512i const   signBit = _mm512_set1_epi32(0x80000000);
    __mmask16 const enabled = _mm512_mask_test_epi32_mask(low8, __conv(mask), signBit);
    _mm512_mask_storeu_ps(p, enabled, __conv(src));
}
|
||||
|
||||
// *p = a (stores all elements contiguously in memory)
// Implemented as a masked unaligned 512-bit store limited to lanes 0-7.
static SIMDINLINE void SIMDCALL store_ps(float* p, Float a)
{
    __mmask16 const low8 = __mmask16(0xff);
    _mm512_mask_storeu_ps(p, low8, __conv(a));
}
|
||||
|
||||
// *p = a
// Implemented as a masked unaligned 512-bit store limited to the low eight
// 32-bit lanes.
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a)
{
    __mmask16 const low8 = __mmask16(0xff);
    _mm512_mask_storeu_epi32(p, low8, __conv(a));
}
|
||||
|
||||
// Expands the low eight bits of `mask` into a vector: lanes whose bit is set
// hold all ones (-1), the remaining lanes are zero.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    __mmask16 const lanes = __mmask16(mask & 0xff);
    return castsi_ps(__conv(_mm512_maskz_set1_epi32(lanes, -1)));
}
|
||||
|
||||
//=======================================================================
|
||||
// Legacy interface (available only in SIMD256 width)
|
||||
//=======================================================================
|
||||
|
||||
// Retire every helper macro defined by this file.
// SIMD_DWRAPPER_2I is defined above but was missing from this list, so it
// leaked out of the file; it is now undefined with the rest.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -1,129 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (512) implementation for Core processors
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are the ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
||||
// Helper macros for this file. Each generated function widens its operands
// with __conv(), invokes the zero-masked 512-bit intrinsic under the given
// write mask, and converts the result back with __conv().
// Masks: 0xf for 64-bit lanes, 0xffff for 16-bit lanes, 0xffffffff for 8-bit
// lanes.

// Double op(Double)
#define SIMD_DWRAPPER_1_(op, intrin, kmask)                \
    static SIMDINLINE Double SIMDCALL op(Double a)         \
    {                                                      \
        __m512d const wa = __conv(a);                      \
        return __conv(_mm512_maskz_##intrin((kmask), wa)); \
    }
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))

// Double op<ImmT>(Double)
#define SIMD_DWRAPPER_1I_(op, intrin, kmask)                     \
    template <int ImmT>                                          \
    static SIMDINLINE Double SIMDCALL op(Double a)               \
    {                                                            \
        __m512d const wa = __conv(a);                            \
        return __conv(_mm512_maskz_##intrin((kmask), wa, ImmT)); \
    }
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))

// Double op(Double, Double)
#define SIMD_DWRAPPER_2_(op, intrin, kmask)                    \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {                                                          \
        __m512d const wa = __conv(a);                          \
        __m512d const wb = __conv(b);                          \
        return __conv(_mm512_maskz_##intrin((kmask), wa, wb)); \
    }
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))

// Integer op(Integer)
#define SIMD_IWRAPPER_1_(op, intrin, kmask)                \
    static SIMDINLINE Integer SIMDCALL op(Integer a)       \
    {                                                      \
        __m512i const wa = __conv(a);                      \
        return __conv(_mm512_maskz_##intrin((kmask), wa)); \
    }
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))

// Integer op<ImmT>(Integer)
#define SIMD_IWRAPPER_1I_(op, intrin, kmask)                     \
    template <int ImmT>                                          \
    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
    {                                                            \
        __m512i const wa = __conv(a);                            \
        return __conv(_mm512_maskz_##intrin((kmask), wa, ImmT)); \
    }
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))

// Integer op(Integer, Integer)
#define SIMD_IWRAPPER_2_(op, intrin, kmask)                     \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        __m512i const wa = __conv(a);                           \
        __m512i const wb = __conv(b);                           \
        return __conv(_mm512_maskz_##intrin((kmask), wa, wb));  \
    }
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
|
||||
|
||||
// Arithmetic
SIMD_IWRAPPER_2_8(add_epi8);   // a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8);  // ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8);  // (b > a) ? 0 : (a - b) (uint8)

// Packs: see the documentation of the matching _mm256_* / _mm512_* intrinsics
// for the element ordering of the result.
SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    (_mm256_packs_epi16 / _mm512_packs_epi16)
SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   (_mm256_packs_epi32 / _mm512_packs_epi32)
SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  (_mm256_packus_epi16 / _mm512_packus_epi16)
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 (_mm256_packus_epi32 / _mm512_packus_epi32)

// Unpacks
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
// Builds a 32-bit mask with one bit per byte of `a`: a bit is set when the
// corresponding byte has bit 0x80 set. Only the low 32 bytes are tested.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    __mmask64 const low32Bytes = 0xffffffffull;
    __m512i const   topBit     = _mm512_set1_epi8(0x80);
    return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(low32Bytes, __conv(a), topBit));
}
|
||||
|
||||
// Retire the helper macros.

// Double wrappers (SIMD_DWRAPPER_2I is not defined in this file; undefining
// it here is harmless if it is not defined)
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I

// Integer wrappers, one operand
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_64

// Integer wrappers, one operand plus immediate
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_64

// Integer wrappers, two operands
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_64
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (512) implementation for Knights Family
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are the ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
|
@ -1,699 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics; provide them on top of the
// generic _mm512_cmp_*_mask forms unless a macro of that name already exists.

#if !defined(_mm512_cmpneq_ps_mask)
#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ)
#endif

#if !defined(_mm512_cmplt_ps_mask)
#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS)
#endif

#if !defined(_mm512_cmplt_pd_mask)
#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS)
#endif

#endif // gcc only
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
|
||||
// processors)
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
static const int TARGET_SIMD_WIDTH = 16;
|
||||
using SIMD256T = SIMD256Impl::AVX2Impl;
|
||||
|
||||
// Helper macros generating thin Float/Double wrappers over 512-bit intrinsics.
// Note: SIMD_WRAPPER_1_ takes the complete intrinsic name, whereas the other
// "_" variants take the name without its _mm512_ prefix.

// Float op(Float)
#define SIMD_WRAPPER_1_(op, intrin)              \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {                                            \
        return intrin(a);                        \
    }

#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)

// Float op(Float, Float)
#define SIMD_WRAPPER_2_(op, intrin)                       \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {                                                     \
        return _mm512_##intrin(a, b);                     \
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

// Float op(Float, Float), implemented with an integer intrinsic on the
// operands reinterpreted as integer vectors
#define SIMD_WRAPPERI_2_(op, intrin)                            \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)       \
    {                                                           \
        return _mm512_castsi512_ps(_mm512_##intrin(             \
            _mm512_castps_si512(a), _mm512_castps_si512(b)));   \
    }

// Double op(Double, Double)
#define SIMD_DWRAPPER_2(op)                                  \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {                                                        \
        return _mm512_##op(a, b);                            \
    }

// Float op<ImmT>(Float, Float)
#define SIMD_WRAPPER_2I_(op, intrin)                      \
    template <int ImmT>                                   \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {                                                     \
        return _mm512_##intrin(a, b, ImmT);               \
    }
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)

// Double op<ImmT>(Double, Double)
#define SIMD_DWRAPPER_2I_(op, intrin)                        \
    template <int ImmT>                                      \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {                                                        \
        return _mm512_##intrin(a, b, ImmT);                  \
    }
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)

// Float op(Float, Float, Float)
#define SIMD_WRAPPER_3(op)                                         \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {                                                              \
        return _mm512_##op(a, b, c);                               \
    }
|
||||
|
||||
// Helper macros generating thin Integer wrappers over 512-bit intrinsics.

// Integer op(Integer)
#define SIMD_IWRAPPER_1(op)                          \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {                                                \
        return _mm512_##op(a);                       \
    }

// Integer op(SIMD256Impl::Integer): 256-bit input, 512-bit result
#define SIMD_IWRAPPER_1_8(op)                                     \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    {                                                             \
        return _mm512_##op(a);                                    \
    }

// Integer op(SIMD128Impl::Integer): 128-bit input, 512-bit result
#define SIMD_IWRAPPER_1_4(op)                                     \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    {                                                             \
        return _mm512_##op(a);                                    \
    }

// Integer op<ImmT>(Integer); `intrin` is the complete intrinsic name
#define SIMD_IWRAPPER_1I_(op, intrin)                \
    template <int ImmT>                              \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {                                                \
        return intrin(a, ImmT);                      \
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

// Integer op(Integer, Integer)
#define SIMD_IWRAPPER_2_(op, intrin)                            \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return _mm512_##intrin(a, b);                           \
    }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)

// Integer op(Integer, Integer) forwarding to an arbitrary callable `cmp`
#define SIMD_IWRAPPER_2_CMP(op, cmp)                            \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return cmp(a, b);                                       \
    }

// Integer op(Integer, Integer), implemented with a float intrinsic on the
// operands reinterpreted through castsi_ps / castps_si
#define SIMD_IFWRAPPER_2(op, intrin)                                   \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
    {                                                                  \
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
    }

// Integer op<ImmT>(Integer, Integer)
#define SIMD_IWRAPPER_2I_(op, intrin)                           \
    template <int ImmT>                                         \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return _mm512_##intrin(a, b, ImmT);                     \
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
private:
|
||||
// Expands a 16-bit k-mask into a vector: every 32-bit lane whose mask bit is
// set becomes -1, all other lanes become 0.
static SIMDINLINE Integer vmask(__mmask16 m)
{
    int const allOnes = -1;
    return _mm512_maskz_set1_epi32(m, allOnes);
}

// Expands an 8-bit k-mask into a vector: every 64-bit lane whose mask bit is
// set becomes -1, all other lanes become 0.
static SIMDINLINE Integer vmask(__mmask8 m)
{
    long long const allOnes = -1LL;
    return _mm512_maskz_set1_epi64(m, allOnes);
}
|
||||
|
||||
public:
|
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps);                       // a + b
SIMD_WRAPPER_2(div_ps);                       // a / b
SIMD_WRAPPER_3(fmadd_ps);                     // (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);                     // (a * b) - c
SIMD_WRAPPER_2(max_ps);                       // (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);                       // (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);                       // a * b
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);     // 1.0f / a       (via rcp14)
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // 1.0f / sqrt(a) (via rsqrt14)
SIMD_WRAPPER_2(sub_ps);                       // a - b
|
||||
|
||||
template <RoundMode RMT>
|
||||
static SIMDINLINE Float SIMDCALL round_ps(Float a)
|
||||
{
|
||||
return _mm512_roundscale_ps(a, static_cast<int>(RMT));
|
||||
}
|
||||
|
||||
// round_ps specialised for RoundMode::CEIL_NOEXC.
static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
{
    Float const rounded = round_ps<RoundMode::CEIL_NOEXC>(a);
    return rounded;
}
|
||||
// round_ps specialised for RoundMode::FLOOR_NOEXC.
static SIMDINLINE Float SIMDCALL floor_ps(Float a)
{
    Float const rounded = round_ps<RoundMode::FLOOR_NOEXC>(a);
    return rounded;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1(abs_epi32); // absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // a + b (int32)
// SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // (a < b) ? a : b (uint32)
// a * b (int32).
// NOTE(review): Intel documents _mm512_mul_epi32 as multiplying the low signed
// 32 bits of each 64-bit element into a 64-bit product - confirm that callers
// expect this rather than a per-lane 32-bit multiply (see mullo_epi32).
SIMD_IWRAPPER_2(mul_epi32);
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2(mullo_epi32);
|
||||
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
|
||||
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
|
||||
// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_2_(and_si, and_si512);       // a & b      (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // (~a) & b   (int)
SIMD_IWRAPPER_2_(or_si, or_si512);         // a | b      (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512);       // a ^ b      (int)

// The float-typed bitwise wrappers are intentionally not generated in this file:
// SIMD_WRAPPER_2(and_ps);    // return a & b       (float treated as int)
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b    (float treated as int)
// SIMD_WRAPPER_2(or_ps);     // return a | b       (float treated as int)
// SIMD_WRAPPER_2(xor_ps);    // return a ^ b       (float treated as int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1I(slli_epi32); // a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);  // forwards to _mm512_sllv_epi32(a, b)
SIMD_IWRAPPER_1I(srai_epi32); // a >> ImmT   (int32)
SIMD_IWRAPPER_1I(srli_epi32); // a >> ImmT   (uint32)

#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif

SIMD_IWRAPPER_2(srlv_epi32); // forwards to _mm512_srlv_epi32(a, b)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations
|
||||
//-----------------------------------------------------------------------
|
||||
// Reinterpret Double as Float: return *(Float*)(&a)
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)
{
    const __m512 bits = _mm512_castpd_ps(a);
    return bits;
}
|
||||
|
||||
// Reinterpret Float as Integer: return *(Integer*)(&a)
static SIMDINLINE Integer SIMDCALL castps_si(Float a)
{
    const __m512i bits = _mm512_castps_si512(a);
    return bits;
}
|
||||
|
||||
// Reinterpret Integer as Double: return *(Double*)(&a)
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)
{
    const __m512d bits = _mm512_castsi512_pd(a);
    return bits;
}
|
||||
|
||||
// Reinterpret Float as Double: return *(Double*)(&a)
static SIMDINLINE Double SIMDCALL castps_pd(Float a)
{
    const __m512d bits = _mm512_castps_pd(a);
    return bits;
}
|
||||
|
||||
// Reinterpret Double as Integer: return *(Integer*)(&a)
static SIMDINLINE Integer SIMDCALL castpd_si(Double a)
{
    const __m512i bits = _mm512_castpd_si512(a);
    return bits;
}
|
||||
|
||||
// Reinterpret Integer as Float: return *(Float*)(&a)
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)
{
    const __m512 bits = _mm512_castsi512_ps(a);
    return bits;
}
|
||||
|
||||
// Value conversion int32 --> float: return (float)a
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)
{
    const __m512 converted = _mm512_cvtepi32_ps(a);
    return converted;
}
|
||||
|
||||
// SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
|
||||
SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
|
||||
SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
|
||||
SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
|
||||
SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
|
||||
|
||||
// Value conversion float --> int32: return (int32)a
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)
{
    const __m512i converted = _mm512_cvtps_epi32(a);
    return converted;
}
|
||||
|
||||
// Value conversion rnd_to_zero(float) --> int32: return (int32)a
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)
{
    const __m512i truncated = _mm512_cvttps_epi32(a);
    return truncated;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <CompareType CmpTypeT>
|
||||
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
|
||||
{
|
||||
return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
|
||||
}
|
||||
|
||||
template <CompareType CmpTypeT>
|
||||
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
|
||||
{
|
||||
// Legacy vector mask generator
|
||||
__mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
|
||||
return castsi_ps(vmask(result));
|
||||
}
|
||||
|
||||
// cmp_ps with the LT_OQ predicate.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
    constexpr CompareType kPredicate = CompareType::LT_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// cmp_ps with the GT_OQ predicate.
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
    constexpr CompareType kPredicate = CompareType::GT_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// cmp_ps with the NEQ_OQ predicate.
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
    constexpr CompareType kPredicate = CompareType::NEQ_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// cmp_ps with the EQ_OQ predicate.
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
    constexpr CompareType kPredicate = CompareType::EQ_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// cmp_ps with the GE_OQ predicate.
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
    constexpr CompareType kPredicate = CompareType::GE_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// cmp_ps with the LE_OQ predicate.
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
    constexpr CompareType kPredicate = CompareType::LE_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
|
||||
template <CompareTypeInt CmpTypeT>
|
||||
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
|
||||
{
|
||||
// Legacy vector mask generator
|
||||
__mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
|
||||
return vmask(result);
|
||||
}
|
||||
template <CompareTypeInt CmpTypeT>
|
||||
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
|
||||
{
|
||||
// Legacy vector mask generator
|
||||
__mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
|
||||
return vmask(result);
|
||||
}
|
||||
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
|
||||
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
|
||||
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
|
||||
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
|
||||
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
|
||||
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
|
||||
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (float)
// Both operands are reinterpreted as integers and tested per 32-bit lane.
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)
{
    const __mmask16 nonZeroLanes = _mm512_test_epi32_mask(castps_si(a), castps_si(b));
    return static_cast<int>(nonZeroLanes) == 0;
}
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (int)
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)
{
    const __mmask16 nonZeroLanes = _mm512_test_epi32_mask(a, b);
    return static_cast<int>(nonZeroLanes) == 0;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float)
|
||||
{
|
||||
return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
|
||||
{
|
||||
return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
|
||||
}
|
||||
|
||||
// return mask ? b : a (float)
// The vector mask is first reduced to a lane bit mask with movemask_ps(), which then
// drives the masked blend.
// SIMDCALL is applied so the declaration matches the other wrappers in this class.
static SIMDINLINE Float SIMDCALL blendv_ps(Float a, Float b, Float mask)
{
    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}
|
||||
|
||||
// return mask ? b : a (int), with a Float-typed vector mask.
// Implemented by round-tripping the integer operands through blendv_ps.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask)
{
    const Float blended = blendv_ps(castsi_ps(a), castsi_ps(b), mask);
    return castps_si(blended);
}
|
||||
|
||||
// return mask ? b : a (int), with an Integer-typed vector mask.
// All three operands are reinterpreted as Float and handed to blendv_ps.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask)
{
    const Float blended = blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask));
    return castps_si(blended);
}
|
||||
|
||||
// return *p (all elements in vector get same value)
static SIMDINLINE Float SIMDCALL broadcast_ss(float const* p)
{
    const float scalar = *p;
    return _mm512_set1_ps(scalar);
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
|
||||
{
|
||||
return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
|
||||
{
|
||||
return _mm512_extractf64x4_pd(a, imm);
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
|
||||
{
|
||||
return _mm512_extracti64x4_epi64(a, imm);
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
|
||||
{
|
||||
return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
|
||||
{
|
||||
return _mm512_insertf64x4(a, b, imm);
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
|
||||
{
|
||||
return _mm512_inserti64x4(a, b, imm);
|
||||
}
|
||||
|
||||
// SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
|
||||
{
|
||||
return _mm512_permute_ps(a, ImmT);
|
||||
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (int32)
// Note the intrinsic takes the index vector first and the data vector second.
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)
{
    const __m512i permuted = _mm512_permutexvar_epi32(swiz, a);
    return permuted;
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (float)
// Note the intrinsic takes the index vector first and the data vector second.
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
{
    const __m512 permuted = _mm512_permutexvar_ps(swiz, a);
    return permuted;
}
|
||||
|
||||
// permute2f128_* are exposed under their legacy names but forward to the 512-bit
// _mm512_shuffle_{f32x4,f64x2,i32x4} intrinsics.
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);

SIMD_IWRAPPER_1I(shuffle_epi32); // forwards to _mm512_shuffle_epi32(a, ImmT)

// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd); // forwards to _mm512_shuffle_pd(a, b, ImmT)
SIMD_WRAPPER_2I(shuffle_ps);  // forwards to _mm512_shuffle_ps(a, b, ImmT)
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
|
||||
{
|
||||
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
|
||||
}
|
||||
|
||||
// NOTE(review): the Intel intrinsics guide lists _mm512_unpackhi_epi16 under AVX512BW, and the
// matching unpacklo_epi16 wrapper is commented out further down in this AVX512F section --
// confirm this wrapper is intended to live here.
SIMD_IWRAPPER_2(unpackhi_epi16);
|
||||
|
||||
// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
|
||||
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
|
||||
{
|
||||
return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2(unpackhi_epi64); // forwards to _mm512_unpackhi_epi64
// SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd); // forwards to _mm512_unpackhi_pd
SIMD_WRAPPER_2(unpackhi_ps);  // forwards to _mm512_unpackhi_ps
// SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps); // integer unpack done through _mm512_unpacklo_ps
SIMD_IWRAPPER_2(unpacklo_epi64);               // forwards to _mm512_unpacklo_epi64
// SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd); // forwards to _mm512_unpacklo_pd
SIMD_WRAPPER_2(unpacklo_ps);  // forwards to _mm512_unpacklo_ps
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
|
||||
}
|
||||
|
||||
// return *p (broadcast 1 value to all elements); alias of broadcast_ss.
static SIMDINLINE Float SIMDCALL load1_ps(float const* p)
{
    const Float splat = broadcast_ss(p);
    return splat;
}
|
||||
|
||||
// return *p (loads SIMD width elements from memory); see loadu_ps for the variant that
// allows unaligned memory.
static SIMDINLINE Float SIMDCALL load_ps(float const* p)
{
    const __m512 loaded = _mm512_load_ps(p);
    return loaded;
}
|
||||
|
||||
// return *p; the load is issued on the address of the `v` member of *p.
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    void const* const src = &p->v;
    return _mm512_load_si512(src);
}
|
||||
|
||||
// return *p (same as load_ps but allows for unaligned mem)
static SIMDINLINE Float SIMDCALL loadu_ps(float const* p)
{
    const __m512 loaded = _mm512_loadu_ps(p);
    return loaded;
}
|
||||
|
||||
// return *p (same as load_si but allows for unaligned mem)
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const* p)
{
    const __m512i loaded = _mm512_loadu_si512(p);
    return loaded;
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
|
||||
{
|
||||
__mmask16 k = _mm512_test_epi32_mask(castps_si(mask), set1_epi32(0x80000000));
|
||||
|
||||
return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
|
||||
}
|
||||
|
||||
// Store the lanes of `src` to `p` for which the corresponding int32 lane of `mask` is
// negative (mask < 0); other destination elements are not written by the masked store.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
    const Mask writeLanes = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, writeLanes, src);
}
|
||||
|
||||
// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
|
||||
//{
|
||||
// __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
|
||||
// return static_cast<uint64_t>(m);
|
||||
//}
|
||||
|
||||
// Collect the sign bit (bit 63) of every 64-bit lane of `a` into the low bits of the result.
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    const __mmask8 signBits =
        _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
    return static_cast<uint32_t>(signBits);
}
|
||||
// Collect the sign bit (bit 31) of every 32-bit lane of `a` into the low bits of the result.
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    const __mmask16 signBits = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
    return static_cast<uint32_t>(signBits);
}
|
||||
|
||||
// return i (all 64-bit elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi64(long long i)
{
    const __m512i splat = _mm512_set1_epi64(i);
    return splat;
}
|
||||
|
||||
// return i (all 32-bit elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
{
    const __m512i splat = _mm512_set1_epi32(i);
    return splat;
}
|
||||
|
||||
// return i (all 8-bit elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
{
    const __m512i splat = _mm512_set1_epi8(i);
    return splat;
}
|
||||
|
||||
// return f (all float elements are same value)
static SIMDINLINE Float SIMDCALL set1_ps(float f)
{
    const __m512 splat = _mm512_set1_ps(f);
    return splat;
}
|
||||
|
||||
// return 0 (double)
static SIMDINLINE Double SIMDCALL setzero_pd()
{
    const __m512d zero = _mm512_setzero_pd();
    return zero;
}
|
||||
|
||||
// return 0 (float)
static SIMDINLINE Float SIMDCALL setzero_ps()
{
    const __m512 zero = _mm512_setzero_ps();
    return zero;
}
|
||||
|
||||
// return 0 (integer)
static SIMDINLINE Integer SIMDCALL setzero_si()
{
    const __m512i zero = _mm512_setzero_si512();
    return zero;
}
|
||||
|
||||
// *p = a (stores all elements contiguously in memory)
static SIMDINLINE void SIMDCALL store_ps(float* p, Float a)
{
    float* const dst = p;
    _mm512_store_ps(dst, a);
}
|
||||
|
||||
// *p = a; the store is issued on the address of the `v` member of *p.
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a)
{
    void* const dst = &p->v;
    _mm512_store_si512(dst, a);
}
|
||||
|
||||
// *p = a (same as store_si but allows for unaligned mem)
static SIMDINLINE void SIMDCALL storeu_si(Integer* p, Integer a)
{
    void* const dst = &p->v;
    _mm512_storeu_si512(dst, a);
}
|
||||
|
||||
// *p = a (same as store_ps, but doesn't keep memory in cache)
static SIMDINLINE void SIMDCALL stream_ps(float* p, Float a)
{
    float* const dst = p;
    _mm512_stream_ps(dst, a);
}
|
||||
|
||||
// Build a vector from sixteen int32 values. Arguments are passed to _mm512_set_epi32
// unchanged and in the same order (i15 first, i0 last).
static SIMDINLINE Integer SIMDCALL set_epi32(int i15, int i14, int i13, int i12,
                                             int i11, int i10, int i9,  int i8,
                                             int i7,  int i6,  int i5,  int i4,
                                             int i3,  int i2,  int i1,  int i0)
{
    const __m512i packed = _mm512_set_epi32(i15, i14, i13, i12,
                                            i11, i10, i9,  i8,
                                            i7,  i6,  i5,  i4,
                                            i3,  i2,  i1,  i0);
    return packed;
}
|
||||
|
||||
// Eight-value convenience overload: i7..i0 fill the i7..i0 slots of the sixteen-value
// overload and the i15..i8 slots are set to zero.
static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    constexpr int kZero = 0;
    return set_epi32(kZero, kZero, kZero, kZero, kZero, kZero, kZero, kZero,
                     i7, i6, i5, i4, i3, i2, i1, i0);
}
|
||||
|
||||
// Build a vector from sixteen float values. Arguments are passed to _mm512_set_ps
// unchanged and in the same order (i15 first, i0 last).
static SIMDINLINE Float SIMDCALL set_ps(float i15, float i14, float i13, float i12,
                                        float i11, float i10, float i9,  float i8,
                                        float i7,  float i6,  float i5,  float i4,
                                        float i3,  float i2,  float i1,  float i0)
{
    const __m512 packed = _mm512_set_ps(i15, i14, i13, i12,
                                        i11, i10, i9,  i8,
                                        i7,  i6,  i5,  i4,
                                        i3,  i2,  i1,  i0);
    return packed;
}
|
||||
|
||||
// Eight-value convenience overload: i7..i0 fill the i7..i0 slots of the sixteen-value
// overload and the i15..i8 slots are set to zero.
static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    constexpr float kZero = 0.0f;
    return set_ps(kZero, kZero, kZero, kZero, kZero, kZero, kZero, kZero,
                  i7, i6, i5, i4, i3, i2, i1, i0);
}
|
||||
|
||||
// Expand the low 16 bits of `mask` into a Float-typed vector mask: lanes whose bit is set
// are all ones, the remaining lanes are zero.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    const __mmask16 laneBits = __mmask16(mask);
    const __m512i   expanded = _mm512_maskz_mov_epi32(laneBits, set1_epi32(-1));
    return castsi_ps(expanded);
}
|
||||
|
||||
// Retire the wrapper-generator macros so they do not leak past this file. Each name is
// listed once; the list also covers the _1_8 / _1_4 / _2_CMP / DWRAPPER_2I_ / IWRAPPER_2I_
// helpers that are used above.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -1,186 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX512 (F) implementation for Core processors
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
// Wrapper generators. Unless noted otherwise, `intrin` is an intrinsic suffix that gets
// the _mm512_ prefix pasted in front of it.

// Float, one operand; `intrin` is the complete intrinsic name here (no prefix is added).
#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    { \
        return intrin(a); \
    }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)

// Float, two operands.
#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_##intrin(a, b); \
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

// Float, two operands, implemented with an integer intrinsic via bit casts.
#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_castsi512_ps( \
            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
    }

// Double, two operands.
#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return _mm512_##op(a, b); \
    }

// Float, two operands plus a compile-time immediate.
#define SIMD_WRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)

// Double, two operands plus a compile-time immediate.
#define SIMD_DWRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)

// Float, three operands.
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    { \
        return _mm512_##op(a, b, c); \
    }

// Integer, one operand.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Integer result from a SIMD256 Integer operand.
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Integer result from a SIMD128 Integer operand.
#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Integer, one operand plus a compile-time immediate; `intrin` is the complete intrinsic
// name here (no prefix is added).
#define SIMD_IWRAPPER_1I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return intrin(a, ImmT); \
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

// Integer, two operands.
#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return _mm512_##intrin(a, b); \
    }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)

// Integer comparison; `cmp` is a callable expression such as cmp_epi8<CompareTypeInt::EQ>.
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return cmp(a, b); \
    }

// Integer, two operands, implemented with a float intrinsic via castsi_ps / castps_si.
#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
    }

// Integer, two operands plus a compile-time immediate.
#define SIMD_IWRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
private:
|
||||
// Expand a 32-bit lane mask into a vector: every 16-bit lane whose mask bit is set
// receives -1, every other lane is zeroed (maskz form).
static SIMDINLINE Integer vmask(__mmask32 m)
{
    const __m512i expanded = _mm512_maskz_set1_epi16(m, -1);
    return expanded;
}
|
||||
// Expand a 64-bit lane mask into a vector: every 8-bit lane whose mask bit is set
// receives -1, every other lane is zeroed (maskz form).
static SIMDINLINE Integer vmask(__mmask64 m)
{
    const __m512i expanded = _mm512_maskz_set1_epi8(m, -1);
    return expanded;
}
|
||||
|
||||
public:
|
||||
SIMD_IWRAPPER_2(add_epi8);  // a + b                                 (int8)
SIMD_IWRAPPER_2(adds_epu8); // ((a + b) > 0xff) ? 0xff : (a + b)     (uint8)
SIMD_IWRAPPER_2(subs_epu8); // (b > a) ? 0 : (a - b)                 (uint8)

SIMD_WRAPPER_2(and_ps);    // a & b      (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // (~a) & b   (float treated as int)
SIMD_WRAPPER_2(or_ps);     // a | b      (float treated as int)
SIMD_WRAPPER_2(xor_ps);    // a ^ b      (float treated as int)

SIMD_IWRAPPER_1_8(cvtepu8_epi16); // (int16)a   (uint8 --> int16), SIMD256 Integer source
|
||||
|
||||
template <CompareTypeInt CmpTypeT>
|
||||
static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
|
||||
{
|
||||
// Legacy vector mask generator
|
||||
__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
|
||||
return vmask(result);
|
||||
}
|
||||
template <CompareTypeInt CmpTypeT>
|
||||
static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
|
||||
{
|
||||
// Legacy vector mask generator
|
||||
__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
|
||||
return vmask(result);
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>);   // a == b  (int8)
SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // a == b  (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>);   // a > b   (int8)
SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // a > b   (int16)

SIMD_IWRAPPER_2(packs_epi16);  // see the documentation for _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);  // see the documentation for _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // see the documentation for _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // see the documentation for _mm512_packus_epi32

SIMD_IWRAPPER_2(unpackhi_epi8);  // see the documentation for _mm512_unpackhi_epi8
SIMD_IWRAPPER_2(unpacklo_epi16); // see the documentation for _mm512_unpacklo_epi16
SIMD_IWRAPPER_2(unpacklo_epi8);  // see the documentation for _mm512_unpacklo_epi8

SIMD_IWRAPPER_2(shuffle_epi8); // forwards to _mm512_shuffle_epi8
|
||||
|
||||
// Return one bit per 8-bit lane of `a`, set when that lane compares less than zero.
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
    const __mmask64 negativeLanes = _mm512_cmplt_epi8_mask(a, setzero_si());
    return static_cast<uint64_t>(negativeLanes);
}
|
||||
|
||||
// Retire every wrapper-generator macro defined at the top of this file so none of them
// leak past it. Each name is listed exactly once.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX512 (F) implementation for Knights Family Processors
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
#define SIMD_WRAPPER_1_(op, intrin) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
|
||||
|
||||
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
|
||||
|
||||
#define SIMD_WRAPPER_2_(op, intrin) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
|
||||
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
|
||||
|
||||
#define SIMD_WRAPPERI_2_(op, intrin) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
|
||||
{ \
|
||||
return _mm512_castsi512_ps( \
|
||||
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
|
||||
}
|
||||
|
||||
#define SIMD_DWRAPPER_2(op) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
|
||||
|
||||
#define SIMD_WRAPPER_2I_(op, intrin) \
|
||||
template <int ImmT> \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
|
||||
{ \
|
||||
return _mm512_##intrin(a, b, ImmT); \
|
||||
}
|
||||
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
|
||||
|
||||
#define SIMD_DWRAPPER_2I_(op, intrin) \
|
||||
template <int ImmT> \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
{ \
|
||||
return _mm512_##intrin(a, b, ImmT); \
|
||||
}
|
||||
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
|
||||
|
||||
#define SIMD_WRAPPER_3(op) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
|
||||
|
||||
#define SIMD_IWRAPPER_1(op) \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
|
||||
#define SIMD_IWRAPPER_1_8(op) \
|
||||
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
|
||||
|
||||
#define SIMD_IWRAPPER_1_4(op) \
|
||||
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
|
||||
|
||||
#define SIMD_IWRAPPER_1I_(op, intrin) \
|
||||
template <int ImmT> \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a) \
|
||||
{ \
|
||||
return intrin(a, ImmT); \
|
||||
}
|
||||
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
|
||||
|
||||
#define SIMD_IWRAPPER_2_(op, intrin) \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
|
||||
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
|
||||
|
||||
// Defines `op(Integer, Integer)` that returns the result of calling the
// supplied comparison callable/macro `cmp` on both operands.
#define SIMD_IWRAPPER_2_CMP(op, cmp)                            \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return cmp(a, b);                                       \
    }
|
||||
|
||||
// Defines `op(Integer, Integer)` implemented with a float intrinsic: both
// operands are reinterpreted via castsi_ps, passed to _mm512_<intrin>, and
// the result is reinterpreted back via castps_si.
#define SIMD_IFWRAPPER_2(op, intrin)                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)            \
    {                                                                      \
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)));     \
    }
|
||||
|
||||
// Defines a two-operand Integer function template `op<ImmT>` that forwards
// to _mm512_<intrin>(a, b, ImmT); ImmT is passed as the intrinsic's immediate.
#define SIMD_IWRAPPER_2I_(op, intrin)                           \
    template <int ImmT>                                         \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {                                                           \
        return _mm512_##intrin(a, b, ImmT);                     \
    }
// Shorthand for the case where the wrapper and the intrinsic share a name.
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
// Bitwise logical operations on Float vectors. Each one is generated by
// SIMD_WRAPPERI_2_, i.e. the operands are cast to 512-bit integers, combined
// with the named _mm512_*_epi32 intrinsic, and cast back to float.
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
|
||||
|
||||
// Tear down the helper macros used by this file so they do not leak past it.
// Each name appears exactly once. The list covers every wrapper macro that is
// #defined in this file, including the `_`-suffixed and _1_8/_1_4/_2_CMP
// variants, which the previous list never undefined.
// NOTE(review): SIMD_WRAPPER_1_, SIMD_WRAPPER_1, SIMD_WRAPPER_2,
// SIMD_WRAPPER_2_ and SIMD_WRAPPER_3_ are presumably defined earlier in the
// file; #undef of a name that is not defined is a harmless no-op either way.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
// Implement mask-enabled SIMD functions
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
// Implement mask-enabled SIMD functions
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
// Implement mask-enabled SIMD functions
|
||||
|
|
@ -1,852 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX (1) implementation
|
||||
//============================================================================
|
||||
|
||||
// Width constant for one half of the emulated vector. In this file it is used
// as the float-element offset from the lower to the upper half in loads and
// stores (p + TARGET_SIMD_WIDTH) and as the shift that selects the upper
// half's bits of a per-lane immediate (ImmT >> TARGET_SIMD_WIDTH).
static const int TARGET_SIMD_WIDTH = 8;
|
||||
// 128-bit implementation used for byte shifts in the widening conversions.
using SIMD128T = SIMD128Impl::AVXImpl;
|
||||
|
||||
// Generates a unary Float function `op` that applies SIMD256T::op to each
// half of the operand (a.v8[0] first, then a.v8[1]).
#define SIMD_WRAPPER_1(op)                                          \
    static SIMDINLINE Float SIMDCALL op(Float const& a)             \
    {                                                               \
        return Float{SIMD256T::op(a.v8[0]), SIMD256T::op(a.v8[1])}; \
    }
|
||||
|
||||
// Generates a binary Float function `op` that applies SIMD256T::op to the
// matching halves of both operands (lower pair first, then upper pair).
#define SIMD_WRAPPER_2(op)                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        return Float{SIMD256T::op(a.v8[0], b.v8[0]),                    \
                     SIMD256T::op(a.v8[1], b.v8[1])};                   \
    }
|
||||
|
||||
// Generates a binary Float function template `op<ImmT>` whose immediate is
// split between the halves: the low 8 bits of ImmT drive the lower half and
// the next 8 bits (ImmT >> TARGET_SIMD_WIDTH) drive the upper half.
#define SIMD_WRAPPER_2I(op)                                                         \
    template <int ImmT>                                                             \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)             \
    {                                                                               \
        return Float{                                                               \
            SIMD256T::template op<(ImmT & 0xFF)>(a.v8[0], b.v8[0]),                 \
            SIMD256T::template op<((ImmT >> TARGET_SIMD_WIDTH) & 0xFF)>(a.v8[1],    \
                                                                        b.v8[1])};  \
    }
|
||||
|
||||
// Generates a binary Float function template `op<ImmT>` that passes the same
// immediate ImmT unchanged to SIMD256T::op for both halves.
#define SIMD_WRAPPER_2I_1(op)                                           \
    template <int ImmT>                                                 \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
    {                                                                   \
        return Float{SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),     \
                     SIMD256T::template op<ImmT>(a.v8[1], b.v8[1])};    \
    }
|
||||
|
||||
// Generates a ternary Float function `op` that applies SIMD256T::op to the
// matching halves of all three operands (lower triple first, then upper).
#define SIMD_WRAPPER_3(op)                                                              \
    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
    {                                                                                   \
        return Float{SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                           \
                     SIMD256T::op(a.v8[1], b.v8[1], c.v8[1])};                          \
    }
|
||||
|
||||
// Generates a unary Integer function `op` that applies SIMD256T::op to each
// half of the operand (a.v8[0] first, then a.v8[1]).
#define SIMD_IWRAPPER_1(op)                                           \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a)           \
    {                                                                 \
        return Integer{SIMD256T::op(a.v8[0]), SIMD256T::op(a.v8[1])}; \
    }
|
||||
|
||||
// Generates a binary Integer function `op` that applies SIMD256T::op to the
// matching halves of both operands (lower pair first, then upper pair).
#define SIMD_IWRAPPER_2(op)                                                   \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return Integer{SIMD256T::op(a.v8[0], b.v8[0]),                        \
                       SIMD256T::op(a.v8[1], b.v8[1])};                       \
    }
|
||||
|
||||
// Generates a binary Integer function template `op<ImmT>` whose immediate is
// split between the halves: the low 8 bits of ImmT drive the lower half and
// the next 8 bits (ImmT >> TARGET_SIMD_WIDTH) drive the upper half.
#define SIMD_IWRAPPER_2I(op)                                                        \
    template <int ImmT>                                                             \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)       \
    {                                                                               \
        return Integer{                                                             \
            SIMD256T::template op<(ImmT & 0xFF)>(a.v8[0], b.v8[0]),                 \
            SIMD256T::template op<((ImmT >> TARGET_SIMD_WIDTH) & 0xFF)>(a.v8[1],    \
                                                                        b.v8[1])};  \
    }
|
||||
|
||||
// Generates a binary Integer function template `op<ImmT>` that passes the
// same immediate ImmT unchanged to SIMD256T::op for both halves.
#define SIMD_IWRAPPER_2I_1(op)                                                \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return Integer{SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),         \
                       SIMD256T::template op<ImmT>(a.v8[1], b.v8[1])};        \
    }
|
||||
|
||||
// Generates a binary Integer function template `op<ImmT>` whose immediate is
// split in 4-bit fields: bits [3:0] of ImmT go to the lower half and bits
// [7:4] go to the upper half.
#define SIMD_IWRAPPER_2I_2(op)                                                \
    template <int ImmT>                                                       \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
    {                                                                         \
        return Integer{                                                       \
            SIMD256T::template op<(ImmT & 0xF)>(a.v8[0], b.v8[0]),            \
            SIMD256T::template op<((ImmT >> 4) & 0xF)>(a.v8[1], b.v8[1])};    \
    }
|
||||
|
||||
// Generates a ternary Integer function `op` that applies SIMD256T::op to the
// matching halves of all three operands (lower triple first, then upper).
#define SIMD_IWRAPPER_3(op)                                                                     \
    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
    {                                                                                           \
        return Integer{SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                 \
                       SIMD256T::op(a.v8[1], b.v8[1], c.v8[1])};                                \
    }
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps); // return a + b
|
||||
SIMD_WRAPPER_2(div_ps); // return a / b
|
||||
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
|
||||
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
|
||||
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
|
||||
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
|
||||
SIMD_WRAPPER_2(mul_ps); // return a * b
|
||||
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
|
||||
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
|
||||
SIMD_WRAPPER_2(sub_ps); // return a - b
|
||||
|
||||
template <RoundMode RMT>
|
||||
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template round_ps<RMT>(a.v8[0]),
|
||||
SIMD256T::template round_ps<RMT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
// Convenience wrapper: round_ps with RoundMode::CEIL_NOEXC.
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
    constexpr RoundMode kMode = RoundMode::CEIL_NOEXC;
    return round_ps<kMode>(a);
}
|
||||
// Convenience wrapper: round_ps with RoundMode::FLOOR_NOEXC.
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
    constexpr RoundMode kMode = RoundMode::FLOOR_NOEXC;
    return round_ps<kMode>(a);
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
|
||||
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
|
||||
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2(mullo_epi32);
|
||||
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
|
||||
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
|
||||
SIMD_IWRAPPER_2(and_si); // return a & b (int)
|
||||
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
|
||||
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
|
||||
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
|
||||
SIMD_IWRAPPER_2(or_si); // return a | b (int)
|
||||
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
|
||||
SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
|
||||
{
|
||||
return Integer{
|
||||
SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
|
||||
SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32)
|
||||
{
|
||||
return Integer{
|
||||
SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
|
||||
SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32)
|
||||
{
|
||||
return Integer{
|
||||
SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
|
||||
SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
template <int ImmT> // for each 128-bit lane:
|
||||
static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint)
|
||||
{
|
||||
return Integer{
|
||||
SIMD256T::template srli_si<ImmT>(a.v8[0]),
|
||||
SIMD256T::template srli_si<ImmT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
|
||||
SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations
|
||||
//-----------------------------------------------------------------------
|
||||
// return *(Float*)(&a) -- reinterprets a Double vector as Float, half by half.
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Float{SIMD256T::castpd_ps(lower), SIMD256T::castpd_ps(upper)};
}
|
||||
|
||||
// return *(Integer*)(&a) -- reinterprets a Float vector as Integer, half by half.
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Integer{SIMD256T::castps_si(lower), SIMD256T::castps_si(upper)};
}
|
||||
|
||||
// return *(Double*)(&a) -- reinterprets an Integer vector as Double, half by half.
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Double{SIMD256T::castsi_pd(lower), SIMD256T::castsi_pd(upper)};
}
|
||||
|
||||
// return *(Double*)(&a) -- reinterprets a Float vector as Double, half by half.
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Double{SIMD256T::castps_pd(lower), SIMD256T::castps_pd(upper)};
}
|
||||
|
||||
// return *(Float*)(&a) -- reinterprets an Integer vector as Float, half by half.
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Float{SIMD256T::castsi_ps(lower), SIMD256T::castsi_ps(upper)};
}
|
||||
|
||||
// return (float)a (int32 --> float), converted half by half.
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Float{SIMD256T::cvtepi32_ps(lower), SIMD256T::cvtepi32_ps(upper)};
}
|
||||
|
||||
// return (int16)a (uint8 --> int16).
// The source is a SIMD256Impl::Integer; its first 128-bit part (a.v4[0])
// produces the lower half of the result and its second (a.v4[1]) the upper.
static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const& a)
{
    auto const& srcLo = a.v4[0];
    auto const& srcHi = a.v4[1];
    return Integer{SIMD256T::cvtepu8_epi16(srcLo), SIMD256T::cvtepu8_epi16(srcHi)};
}
|
||||
|
||||
// return (int32)a (uint8 --> int32).
// Only a.v4[0] is consumed: it is converted directly for the lower half of
// the result, and converted again after an 8-byte right shift
// (SIMD128T::srli_si<8>) for the upper half.
static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const& a)
{
    auto const& src = a.v4[0];
    return Integer{SIMD256T::cvtepu8_epi32(src),
                   SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(src))};
}
|
||||
|
||||
// return (int32)a (uint16 --> int32).
// a.v4[0] produces the lower half of the result and a.v4[1] the upper half.
static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const& a)
{
    auto const& srcLo = a.v4[0];
    auto const& srcHi = a.v4[1];
    return Integer{SIMD256T::cvtepu16_epi32(srcLo), SIMD256T::cvtepu16_epi32(srcHi)};
}
|
||||
|
||||
// return (int64)a (uint16 --> int64).
// Only a.v4[0] is consumed: it is converted directly for the lower half of
// the result, and converted again after an 8-byte right shift
// (SIMD128T::srli_si<8>) for the upper half.
static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const& a)
{
    auto const& src = a.v4[0];
    return Integer{SIMD256T::cvtepu16_epi64(src),
                   SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(src))};
}
|
||||
|
||||
// return (int64)a (uint32 --> int64).
// a.v4[0] produces the lower half of the result and a.v4[1] the upper half.
static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const& a)
{
    auto const& srcLo = a.v4[0];
    auto const& srcHi = a.v4[1];
    return Integer{SIMD256T::cvtepu32_epi64(srcLo), SIMD256T::cvtepu32_epi64(srcHi)};
}
|
||||
|
||||
// return (int32)a (float --> int32), converted half by half.
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const& a)
{
    auto const& lower = a.v8[0];
    auto const& upper = a.v8[1];
    return Integer{SIMD256T::cvtps_epi32(lower), SIMD256T::cvtps_epi32(upper)};
}
|
||||
|
||||
// return (int32)a (rnd_to_zero(float) --> int32)
//
// Converts each float lane to int32 with truncation, half by half.
// This must forward to the truncating SIMD256T::cvttps_epi32. The previous
// body called SIMD256T::cvtps_epi32 (the non-truncating conversion), which
// made this function behave exactly like cvtps_epi32 and contradicted the
// round-toward-zero contract stated above.
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const& a)
{
    return Integer{
        SIMD256T::cvttps_epi32(a.v8[0]),
        SIMD256T::cvttps_epi32(a.v8[1]),
    };
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Comparison operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <CompareType CmpTypeT>
|
||||
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
|
||||
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
|
||||
};
|
||||
}
|
||||
// Lane-wise comparison via cmp_ps with predicate CompareType::LT_OQ.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
    constexpr CompareType kPredicate = CompareType::LT_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// Lane-wise comparison via cmp_ps with predicate CompareType::GT_OQ.
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
    constexpr CompareType kPredicate = CompareType::GT_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// Lane-wise comparison via cmp_ps with predicate CompareType::NEQ_OQ.
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
    constexpr CompareType kPredicate = CompareType::NEQ_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// Lane-wise comparison via cmp_ps with predicate CompareType::EQ_OQ.
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
    constexpr CompareType kPredicate = CompareType::EQ_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// Lane-wise comparison via cmp_ps with predicate CompareType::GE_OQ.
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
    constexpr CompareType kPredicate = CompareType::GE_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
// Lane-wise comparison via cmp_ps with predicate CompareType::LE_OQ.
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
    constexpr CompareType kPredicate = CompareType::LE_OQ;
    return cmp_ps<kPredicate>(a, b);
}
|
||||
|
||||
template <CompareType CmpTypeT>
|
||||
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
|
||||
{
|
||||
return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
|
||||
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
|
||||
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
|
||||
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (float)
// True only when SIMD256T::testz_ps reports non-zero for both the lower and
// the upper pair of halves.
static SIMDINLINE bool SIMDCALL testz_ps(Float const& a, Float const& b)
{
    const auto lowerZero = SIMD256T::testz_ps(a.v8[0], b.v8[0]);
    const auto upperZero = SIMD256T::testz_ps(a.v8[1], b.v8[1]);
    return (lowerZero & upperZero) != 0;
}
|
||||
|
||||
// return all_lanes_zero(a & b) ? 1 : 0 (int)
// True only when SIMD256T::testz_si reports non-zero for both the lower and
// the upper pair of halves.
static SIMDINLINE bool SIMDCALL testz_si(Integer const& a, Integer const& b)
{
    const auto lowerZero = SIMD256T::testz_si(a.v8[0], b.v8[0]);
    const auto upperZero = SIMD256T::testz_si(a.v8[1], b.v8[1]);
    return (lowerZero & upperZero) != 0;
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
|
||||
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
|
||||
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
|
||||
// return mask ? b : a (int), with the selection mask supplied as a Float.
// Each half of a, b and mask is forwarded to SIMD256T::blendv_epi32.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
                                                Integer const& b,
                                                Float const&   mask)
{
    auto const& maskLo = mask.v8[0];
    auto const& maskHi = mask.v8[1];
    return Integer{SIMD256T::blendv_epi32(a.v8[0], b.v8[0], maskLo),
                   SIMD256T::blendv_epi32(a.v8[1], b.v8[1], maskHi)};
}
|
||||
|
||||
// return mask ? b : a (int), with the selection mask supplied as an Integer.
// Each half of a, b and mask is forwarded to SIMD256T::blendv_epi32.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
                                                Integer const& b,
                                                Integer const& mask)
{
    auto const& maskLo = mask.v8[0];
    auto const& maskHi = mask.v8[1];
    return Integer{SIMD256T::blendv_epi32(a.v8[0], b.v8[0], maskLo),
                   SIMD256T::blendv_epi32(a.v8[1], b.v8[1], maskHi)};
}
|
||||
|
||||
// return *p (all elements in vector get same value)
// The scalar is read once and then splatted into both halves with
// SIMD256T::set1_ps.
static SIMDINLINE Float SIMDCALL broadcast_ss(float const* p)
{
    const float scalar = *p;
    return Float{SIMD256T::set1_ps(scalar), SIMD256T::set1_ps(scalar)};
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
|
||||
{
|
||||
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
|
||||
return a.v8[imm];
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
|
||||
{
|
||||
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
|
||||
return a.v8[imm];
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
|
||||
{
|
||||
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
|
||||
return a.v8[imm];
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
|
||||
{
|
||||
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
|
||||
Float r = a;
|
||||
r.v8[imm] = b;
|
||||
return r;
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
|
||||
{
|
||||
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
|
||||
Double r = a;
|
||||
r.v8[imm] = b;
|
||||
return r;
|
||||
}
|
||||
|
||||
template <int imm>
|
||||
static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
|
||||
{
|
||||
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
|
||||
Integer r = a;
|
||||
r.v8[imm] = b;
|
||||
return r;
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template permute_ps<ImmT>(a.v8[0]),
|
||||
SIMD256T::template permute_ps<ImmT>(a.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (int32)
// Implemented on top of the float variant: a is reinterpreted as Float,
// permuted with permute_ps(Float, Integer), and reinterpreted back.
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const& a, Integer const& swiz)
{
    return castps_si(
        permute_ps(castsi_ps(a), swiz));
}
|
||||
|
||||
// return a[swiz[i]] for each 32-bit lane i (float)
//
// Each half of the result may pull lanes from either half of `a`, so for each
// half of `swiz` both source halves are permuted with the index masked to
// 0..7, and the per-lane test (swiz > 7) selects the upper-source candidate
// where true and the lower-source candidate otherwise.
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a, Integer const& swiz)
{
    const auto laneMask = SIMD256T::set1_epi32(7);

    auto const& srcLo = a.v8[0];
    auto const& srcHi = a.v8[1];

    // Candidates for the lower half of the result (indices from swiz.v8[0]).
    auto fromLoForLo = SIMD256T::permute_ps(srcLo, SIMD256T::and_si(swiz.v8[0], laneMask));
    auto fromHiForLo = SIMD256T::permute_ps(srcHi, SIMD256T::and_si(swiz.v8[0], laneMask));

    // Candidates for the upper half of the result (indices from swiz.v8[1]).
    auto fromLoForHi = SIMD256T::permute_ps(srcLo, SIMD256T::and_si(swiz.v8[1], laneMask));
    auto fromHiForHi = SIMD256T::permute_ps(srcHi, SIMD256T::and_si(swiz.v8[1], laneMask));

    return Float{
        SIMD256T::blendv_ps(fromLoForLo,
                            fromHiForLo,
                            SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], laneMask))),
        SIMD256T::blendv_ps(fromLoForHi,
                            fromHiForHi,
                            SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], laneMask))),
    };
}
|
||||
|
||||
// All of the 512-bit permute2f128_XX intrinsics do the following:
|
||||
//
|
||||
// SELECT4(src, control) {
|
||||
// CASE(control[1:0])
|
||||
// 0 : tmp[127:0] : = src[127:0]
|
||||
// 1 : tmp[127:0] : = src[255:128]
|
||||
// 2 : tmp[127:0] : = src[383:256]
|
||||
// 3 : tmp[127:0] : = src[511:384]
|
||||
// ESAC
|
||||
// RETURN tmp[127:0]
|
||||
// }
|
||||
//
|
||||
// dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
|
||||
// dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
|
||||
// dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
|
||||
// dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
|
||||
// dst[MAX:512] : = 0
|
||||
//
|
||||
// Since the 256-bit AVX instructions use a 4-bit control field (instead
|
||||
// of 2-bit for AVX512), we need to expand the control bits sent to the
|
||||
// AVX instructions for emulation.
|
||||
//
|
||||
template <int shuf>
|
||||
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
|
||||
a.v8[1]),
|
||||
SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
|
||||
b.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
template <int shuf>
|
||||
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
|
||||
{
|
||||
return Double{
|
||||
SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
|
||||
a.v8[1]),
|
||||
SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
|
||||
b.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
template <int shuf>
|
||||
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
|
||||
{
|
||||
return Integer{
|
||||
SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
|
||||
a.v8[1]),
|
||||
SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
|
||||
b.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
// Shuffle / unpack wrappers. Each is generated by one of the SIMD_*WRAPPER_*
// macros above and therefore applies the same-named SIMD256T operation to
// the lower and upper halves independently.
// _2I_1: the same ImmT is used for both halves.
SIMD_IWRAPPER_2I_1(shuffle_epi32);
|
||||
// _2I_2: ImmT bits [3:0] control the lower half, bits [7:4] the upper half.
SIMD_IWRAPPER_2I_2(shuffle_epi64);
|
||||
SIMD_IWRAPPER_2(shuffle_epi8);
|
||||
// NOTE(review): the *_pd wrappers below are generated with the Float-typed
// macros although the names suggest double operands -- confirm intended.
SIMD_WRAPPER_2I_1(shuffle_pd);
|
||||
SIMD_WRAPPER_2I_1(shuffle_ps);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi16);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi32);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi64);
|
||||
SIMD_IWRAPPER_2(unpackhi_epi8);
|
||||
SIMD_WRAPPER_2(unpackhi_pd);
|
||||
SIMD_WRAPPER_2(unpackhi_ps);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi16);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi32);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi64);
|
||||
SIMD_IWRAPPER_2(unpacklo_epi8);
|
||||
SIMD_WRAPPER_2(unpacklo_pd);
|
||||
SIMD_WRAPPER_2(unpacklo_ps);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
//-----------------------------------------------------------------------
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
|
||||
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
|
||||
SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
// return *p (broadcast 1 value to all elements)
// Thin alias for broadcast_ss.
static SIMDINLINE Float SIMDCALL load1_ps(float const* p)
{
    return broadcast_ss(p);
}
|
||||
|
||||
// return *p (loads SIMD width elements from memory)
// The lower half is loaded from p and the upper half from
// p + TARGET_SIMD_WIDTH, both via SIMD256T::load_ps.
static SIMDINLINE Float SIMDCALL load_ps(float const* p)
{
    float const* const upperSrc = p + TARGET_SIMD_WIDTH;
    return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(upperSrc)};
}
|
||||
|
||||
// return *p
// Each half is loaded through SIMD256T::load_si from the address of the
// corresponding v8 element of *p.
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p)
{
    const auto* lowerSrc = &p->v8[0];
    const auto* upperSrc = &p->v8[1];
    return Integer{SIMD256T::load_si(lowerSrc), SIMD256T::load_si(upperSrc)};
}
|
||||
|
||||
// return *p (same as load_ps but allows for unaligned mem).
// The lower half is read from p, the upper half from p + TARGET_SIMD_WIDTH.
static SIMDINLINE Float SIMDCALL loadu_ps(float const* p)
{
    float const* const pUpper = p + TARGET_SIMD_WIDTH;

    auto const lo = SIMD256T::loadu_ps(p);
    auto const hi = SIMD256T::loadu_ps(pUpper);
    return Float{lo, hi};
}
|
||||
|
||||
// return *p (same as load_si but allows for unaligned mem).
// Each half of the source vector (v8[0], v8[1]) is loaded separately through
// SIMD256T::loadu_si.
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const* p)
{
    auto const lo = SIMD256T::loadu_si(&p->v8[0]);
    auto const hi = SIMD256T::loadu_si(&p->v8[1]);
    return Integer{lo, hi};
}
|
||||
|
||||
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
|
||||
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
|
||||
static SIMDINLINE Float SIMDCALL
|
||||
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
|
||||
{
|
||||
return Float{
|
||||
SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
|
||||
SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
|
||||
};
|
||||
}
|
||||
|
||||
// Masked store of src to p, issued as two SIMD256T::maskstore_ps calls:
// the lower half goes to p, the upper half to p + TARGET_SIMD_WIDTH.
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
    float* const pUpper = p + TARGET_SIMD_WIDTH;

    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
    SIMD256T::maskstore_ps(pUpper, mask.v8[1], src.v8[1]);
}
|
||||
|
||||
// Combines the per-half byte masks into one 64-bit mask: the lower half's
// mask occupies the low bits, the upper half's mask is shifted up by
// (TARGET_SIMD_WIDTH * 4) bits.
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
{
    uint64_t const lo = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
    uint64_t const hi = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1]));

    return lo | (hi << (TARGET_SIMD_WIDTH * 4));
}
|
||||
|
||||
// Combines the per-half double-precision masks: the lower half's mask
// occupies the low bits, the upper half's mask is shifted up by
// (TARGET_SIMD_WIDTH / 2) bits.
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
    uint32_t const lo = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
    uint32_t const hi = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1]));

    return lo | (hi << (TARGET_SIMD_WIDTH / 2));
}
|
||||
// Combines the per-half single-precision masks: the lower half's mask
// occupies the low bits, the upper half's mask is shifted up by
// TARGET_SIMD_WIDTH bits.
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
    uint32_t const lo = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
    uint32_t const hi = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1]));

    return lo | (hi << TARGET_SIMD_WIDTH);
}
|
||||
|
||||
// return i (all elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
{
    return Integer{
        SIMD256T::set1_epi32(i), // lower half
        SIMD256T::set1_epi32(i), // upper half
    };
}
|
||||
|
||||
// return i (all elements are same value)
static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
{
    return Integer{
        SIMD256T::set1_epi8(i), // lower half
        SIMD256T::set1_epi8(i), // upper half
    };
}
|
||||
|
||||
// return f (all elements are same value)
static SIMDINLINE Float SIMDCALL set1_ps(float f)
{
    return Float{
        SIMD256T::set1_ps(f), // lower half
        SIMD256T::set1_ps(f), // upper half
    };
}
|
||||
|
||||
// return 0 (float)
static SIMDINLINE Float SIMDCALL setzero_ps()
{
    return Float{
        SIMD256T::setzero_ps(), // lower half
        SIMD256T::setzero_ps(), // upper half
    };
}
|
||||
|
||||
// return 0 (integer)
static SIMDINLINE Integer SIMDCALL setzero_si()
{
    return Integer{
        SIMD256T::setzero_si(), // lower half
        SIMD256T::setzero_si(), // upper half
    };
}
|
||||
|
||||
// *p = a (stores all elements contiguously in memory).
// The lower half is written at p, the upper half at p + TARGET_SIMD_WIDTH.
static SIMDINLINE void SIMDCALL store_ps(float* p, Float const& a)
{
    float* const pUpper = p + TARGET_SIMD_WIDTH;

    SIMD256T::store_ps(p, a.v8[0]);
    SIMD256T::store_ps(pUpper, a.v8[1]);
}
|
||||
|
||||
// *p = a
// Each half of a is stored into the matching half of *p via
// SIMD256T::store_si.
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a)
{
    auto* const dstLo = &p->v8[0];
    auto* const dstHi = &p->v8[1];

    SIMD256T::store_si(dstLo, a.v8[0]);
    SIMD256T::store_si(dstHi, a.v8[1]);
}
|
||||
|
||||
// *p = a (same as store_ps, but doesn't keep memory in cache).
// The lower half is written at p, the upper half at p + TARGET_SIMD_WIDTH.
static SIMDINLINE void SIMDCALL stream_ps(float* p, Float const& a)
{
    float* const pUpper = p + TARGET_SIMD_WIDTH;

    SIMD256T::stream_ps(p, a.v8[0]);
    SIMD256T::stream_ps(pUpper, a.v8[1]);
}
|
||||
|
||||
// Builds an Integer from sixteen 32-bit values given highest element first.
// i7..i0 populate the lower half, i15..i8 the upper half.
static SIMDINLINE Integer SIMDCALL set_epi32(int i15, int i14, int i13, int i12,
                                             int i11, int i10, int i9,  int i8,
                                             int i7,  int i6,  int i5,  int i4,
                                             int i3,  int i2,  int i1,  int i0)
{
    auto const lo = SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
    auto const hi = SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8);
    return Integer{lo, hi};
}
|
||||
|
||||
// Eight-value convenience overload: i7..i0 populate the lower half and the
// upper half is filled with zeros (equivalent to calling the sixteen-value
// overload with 0 for i15..i8).
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    auto const lo = SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
    auto const hi = SIMD256T::set_epi32(0, 0, 0, 0, 0, 0, 0, 0);
    return Integer{lo, hi};
}
|
||||
|
||||
// Builds a Float from sixteen values given highest element first.
// i7..i0 populate the lower half, i15..i8 the upper half.
static SIMDINLINE Float SIMDCALL set_ps(float i15, float i14, float i13, float i12,
                                        float i11, float i10, float i9,  float i8,
                                        float i7,  float i6,  float i5,  float i4,
                                        float i3,  float i2,  float i1,  float i0)
{
    auto const lo = SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
    auto const hi = SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8);
    return Float{lo, hi};
}
|
||||
|
||||
// Eight-value convenience overload: i7..i0 populate the lower half and the
// upper half is filled with zeros (equivalent to calling the sixteen-value
// overload with 0 for i15..i8).
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    auto const lo = SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
    auto const hi = SIMD256T::set_ps(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
    return Float{lo, hi};
}
|
||||
|
||||
// Expands a bit mask into a Float vector mask, one SIMD256T::vmask_ps call
// per half: the lower half receives mask unchanged, the upper half receives
// mask shifted right by TARGET_SIMD_WIDTH bits.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    auto const lo = SIMD256T::vmask_ps(mask);
    auto const hi = SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH);
    return Float{lo, hi};
}
|
||||
|
||||
#undef SIMD_WRAPPER_1
|
||||
#undef SIMD_WRAPPER_2
|
||||
#undef SIMD_WRAPPER_2I
|
||||
#undef SIMD_WRAPPER_2I_1
|
||||
#undef SIMD_WRAPPER_3
|
||||
#undef SIMD_IWRAPPER_1
|
||||
#undef SIMD_IWRAPPER_2
|
||||
#undef SIMD_IWRAPPER_2I
|
||||
#undef SIMD_IWRAPPER_2I_1
|
||||
#undef SIMD_IWRAPPER_3
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
// no backwards compatibility for simd mask-enabled functions
|
||||
|
|
@ -1,332 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#pragma once
|
||||
#if 0
|
||||
//===========================================================================
|
||||
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
|
||||
//===========================================================================
|
||||
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples.  The SIMD256 and SIMD16 implementations will
    // use different base types with this same naming.
    using Float   = __m256;  // Packed single-precision float vector
    using Double  = __m256d; // Packed double-precision float vector
    using Integer = __m256i; // Packed integer vector (mutable element widths)
    using Mask    = uint8_t; // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float add_ps(Float a, Float b);            // return a + b
    static Float div_ps(Float a, Float b);            // return a / b
    static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
    static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
    static Float max_ps(Float a, Float b);            // return (a > b) ? a : b
    static Float min_ps(Float a, Float b);            // return (a < b) ? a : b
    static Float mul_ps(Float a, Float b);            // return a * b
    static Float rcp_ps(Float a);                     // return 1.0f / a
    static Float rsqrt_ps(Float a);                   // return 1.0f / sqrt(a)
    static Float sub_ps(Float a, Float b);            // return a - b

    enum class RoundMode
    {
        // NOTE(review): these values mirror the _MM_FROUND_* rounding-control
        // encodings, where "nearest" resolves ties to the even integer rather
        // than away from zero -- confirm against the round_ps implementation.
        TO_NEAREST_INT = 0x00, // Round to nearest integer
        TO_NEG_INF     = 0x01, // Round to negative infinity
        TO_POS_INF     = 0x02, // Round to positive infinity
        TO_ZERO        = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION  = 0x04, // Round in direction set in MXCSR register

        RAISE_EXC = 0x00, // Raise exception on overflow
        NO_EXC    = 0x08, // Suppress exceptions

        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR       = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
        CEIL        = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC  = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
        TRUNC       = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
        RINT        = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
        NEARBYINT   = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen on the RMT template parameter.  See the documentation
    // for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float round_ps(Float a); // return round(a)

    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer abs_epi32(Integer a);            // return absolute_value(a) (int32)
    static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
    static Integer add_epi8(Integer a, Integer b);  // return a + b (int8)
    static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
    static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
    static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
    static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
    static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    // (Integer result: the declaration previously said Float by mistake.)
    static Integer mullo_epi32(Integer a, Integer b);

    static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
    static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
    static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float   and_ps(Float a, Float b);        // return a & b (float treated as int)
    static Integer and_si(Integer a, Integer b);    // return a & b (int)
    static Float   andnot_ps(Float a, Float b);     // return (~a) & b (float treated as int)
    static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
    static Float   or_ps(Float a, Float b);         // return a | b (float treated as int)
    static Integer or_si(Integer a, Integer b);     // return a | b (int)
    static Float   xor_ps(Float a, Float b);        // return a ^ b (float treated as int)
    static Integer xor_si(Integer a, Integer b);    // return a ^ b (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Integer slli_epi32(Integer a);            // return a << ImmT
    static Integer sllv_epi32(Integer a, Integer b); // return a << b
    template <int ImmT>
    static Integer srai_epi32(Integer a);            // return a >> ImmT   (int32)
    template <int ImmT>
    static Integer srli_epi32(Integer a);            // return a >> ImmT   (uint32)
    template <int ImmT>                              // for each 128-bit lane:
    static Integer srli_si(Integer a);               //  return a >> (ImmT*8) (uint)
    template <int ImmT>
    static Float srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
    static Integer srlv_epi32(Integer a, Integer b); // return a >> b      (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //-----------------------------------------------------------------------
    static Float   castpd_ps(Double a);       // return *(Float*)(&a)
    static Integer castps_si(Float a);        // return *(Integer*)(&a)
    static Double  castsi_pd(Integer a);      // return *(Double*)(&a)
    static Double  castps_pd(Float a);        // return *(Double*)(&a)
    static Float   castsi_ps(Integer a);      // return *(Float*)(&a)
    static Float   cvtepi32_ps(Integer a);    // return (float)a    (int32 --> float)
    static Integer cvtepu8_epi16(Integer a);  // return (int16)a    (uint8 --> int16)
    static Integer cvtepu8_epi32(Integer a);  // return (int32)a    (uint8 --> int32)
    static Integer cvtepu16_epi32(Integer a); // return (int32)a    (uint16 --> int32)
    static Integer cvtepu16_epi64(Integer a); // return (int64)a    (uint16 --> int64)
    static Integer cvtepu32_epi64(Integer a); // return (int64)a    (uint32 --> int64)
    static Integer cvtps_epi32(Float a);      // return (int32)a    (float --> int32)
    static Integer cvttps_epi32(Float a);     // return (int32)a    (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //      - ordered comparisons are always false if either operand is NaN
    //      - unordered comparisons are always true if either operand is NaN
    //      - signaling comparisons raise an exception if either operand is NaN
    //      - non-signaling comparisons will never raise an exception
    //
    // Ordered:   return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (nonsignaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (nonsignaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template <CompareType CmpTypeT>
    static Float cmp_ps(Float a, Float b);    // return a (CmpTypeT) b (see above)
    static Float cmpgt_ps(Float a, Float b);  // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float cmple_ps(Float a, Float b);  // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float cmplt_ps(Float a, Float b);  // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float cmpeq_ps(Float a, Float b);  // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float cmpge_ps(Float a, Float b);  // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer cmpeq_epi8(Integer a, Integer b);  // return a == b (int8)
    static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
    static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
    static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
    static Integer cmpgt_epi8(Integer a, Integer b);  // return a > b (int8)
    static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
    static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
    static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
    static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
    static bool testz_ps(Float a, Float b);     // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int)

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Float blend_ps(Float a, Float b);                       // return ImmT ? b : a  (float)
    static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
    static Float blendv_ps(Float a, Float b, Float mask);          // return mask ? b : a (float)
    static Float broadcast_ss(float const* p); // return *p (all elements in vector get same value)
    static Integer packs_epi16(Integer a, Integer b);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer packs_epi32(Integer a, Integer b);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float permute_ps(Float a, Integer swiz);        // return a[swiz[i]] for each 32-bit lane i (float)
    template <int SwizT>
    static Integer shuffle_epi32(Integer a, Integer b);
    template <int SwizT>
    static Integer shuffle_epi64(Integer a, Integer b);
    static Integer shuffle_epi8(Integer a, Integer b);
    template <int SwizT>
    static Double shuffle_pd(Double a, Double b);
    template <int SwizT>
    static Float shuffle_ps(Float a, Float b);
    static Integer unpackhi_epi16(Integer a, Integer b);
    static Integer unpackhi_epi32(Integer a, Integer b);
    static Integer unpackhi_epi64(Integer a, Integer b);
    static Integer unpackhi_epi8(Integer a, Integer b);
    static Double unpackhi_pd(Double a, Double b);
    static Float unpackhi_ps(Float a, Float b);
    static Integer unpacklo_epi16(Integer a, Integer b);
    static Integer unpacklo_epi32(Integer a, Integer b);
    static Integer unpacklo_epi64(Integer a, Integer b);
    static Integer unpacklo_epi8(Integer a, Integer b);
    static Double unpacklo_pd(Double a, Double b);
    static Float unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------
    enum class ScaleFactor
    {
        SF_1, // No scaling
        SF_2, // Scale offset by 2
        SF_4, // Scale offset by 4
        SF_8, // Scale offset by 8
    };

    template <ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float load1_ps(float const* p);     // return *p    (broadcast 1 value to all elements)
    static Float load_ps(float const* p);      // return *p    (loads SIMD width elements from memory)
    static Integer load_si(Integer const* p);  // return *p
    static Float loadu_ps(float const* p);     // return *p    (same as load_ps but allows for unaligned mem)
    static Integer loadu_si(Integer const* p); // return *p    (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    template <ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void maskstore_ps(float* p, Integer mask, Float src);
    static int movemask_epi8(Integer a);
    static int movemask_pd(Double a);
    static int movemask_ps(Float a);
    static Integer set1_epi32(int i);            // return i (all elements are same value)
    static Integer set1_epi8(char i);            // return i (all elements are same value)
    static Float set1_ps(float f);               // return f (all elements are same value)
    static Float setzero_ps();                   // return 0 (float)
    static Integer setzero_si();                 // return 0 (integer)
    static void store_ps(float* p, Float a);     // *p = a   (stores all elements contiguously in memory)
    static void store_si(Integer* p, Integer a); // *p = a
    static void stream_ps(float* p, Float a);    // *p = a   (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //=======================================================================

    static Float broadcast_ps(__m128 const* p);
    template <int ImmT>
    static __m128d extractf128_pd(Double a);
    template <int ImmT>
    static __m128 extractf128_ps(Float a);
    template <int ImmT>
    static __m128i extractf128_si(Integer a);
    template <int ImmT>
    static Double insertf128_pd(Double a, __m128d b);
    template <int ImmT>
    static Float insertf128_ps(Float a, __m128 b);
    template <int ImmT>
    static Integer insertf128_si(Integer a, __m128i b);
    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
    template <int ImmT>
    static Double permute2f128_pd(Double a, Double b);
    template <int ImmT>
    static Float permute2f128_ps(Float a, Float b);
    template <int ImmT>
    static Integer permute2f128_si(Integer a, Integer b);
    static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void storeu2_si(__m128i* phi, __m128i* plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================
};
|
||||
#endif // #if 0
|
||||
|
|
@ -1,457 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#if !defined(__cplusplus)
|
||||
#error C++ compilation required
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// Instruction-set levels that SIMD_ARCH can be set to (ordered, so they can
// be compared with >=).
#define SIMD_ARCH_AVX 0
#define SIMD_ARCH_AVX2 1
#define SIMD_ARCH_AVX512 2

// Fall back to plain AVX when the build has not selected a level.
#ifndef SIMD_ARCH
#define SIMD_ARCH SIMD_ARCH_AVX
#endif

// Compiler-specific spellings for the calling convention, the inline hint and
// an over-aligned declaration.
#ifdef _MSC_VER
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif
|
||||
|
||||
// For documentation, please see the following include...
|
||||
// #include "simdlib_interface.hpp"
|
||||
|
||||
namespace SIMDImpl
|
||||
{
|
||||
// Floating-point comparison predicates.
// Suffix legend: O = ordered, U = unordered, Q = quiet (non-signaling),
// S = signaling.
enum class CompareType
{
    // 0x00 - 0x07
    EQ_OQ   = 0x00, // equal                   (ordered,   quiet)
    LT_OS   = 0x01, // less-than               (ordered,   signaling)
    LE_OS   = 0x02, // less-than-or-equal      (ordered,   signaling)
    UNORD_Q = 0x03, // unordered               (quiet)
    NEQ_UQ  = 0x04, // not-equal               (unordered, quiet)
    NLT_US  = 0x05, // not-less-than           (unordered, signaling)
    NLE_US  = 0x06, // not-less-than-or-equal  (unordered, signaling)
    ORD_Q   = 0x07, // ordered                 (quiet)

    // 0x08 - 0x0F
    EQ_UQ    = 0x08, // equal                     (unordered, quiet)
    NGE_US   = 0x09, // not-greater-than-or-equal (unordered, signaling)
    NGT_US   = 0x0A, // not-greater-than          (unordered, signaling)
    FALSE_OQ = 0x0B, // always false              (ordered,   quiet)
    NEQ_OQ   = 0x0C, // not-equal                 (ordered,   quiet)
    GE_OS    = 0x0D, // greater-than-or-equal     (ordered,   signaling)
    GT_OS    = 0x0E, // greater-than              (ordered,   signaling)
    TRUE_UQ  = 0x0F, // always true               (unordered, quiet)

    // 0x10 - 0x17
    EQ_OS   = 0x10, // equal                   (ordered,   signaling)
    LT_OQ   = 0x11, // less-than               (ordered,   quiet)
    LE_OQ   = 0x12, // less-than-or-equal      (ordered,   quiet)
    UNORD_S = 0x13, // unordered               (signaling)
    NEQ_US  = 0x14, // not-equal               (unordered, signaling)
    NLT_UQ  = 0x15, // not-less-than           (unordered, quiet)
    NLE_UQ  = 0x16, // not-less-than-or-equal  (unordered, quiet)
    ORD_S   = 0x17, // ordered                 (signaling)

    // 0x18 - 0x1F
    EQ_US    = 0x18, // equal                     (unordered, signaling)
    NGE_UQ   = 0x19, // not-greater-than-or-equal (unordered, quiet)
    NGT_UQ   = 0x1A, // not-greater-than          (unordered, quiet)
    FALSE_OS = 0x1B, // always false              (ordered,   signaling)
    NEQ_OS   = 0x1C, // not-equal                 (ordered,   signaling)
    GE_OQ    = 0x1D, // greater-than-or-equal     (ordered,   quiet)
    GT_OQ    = 0x1E, // greater-than              (ordered,   quiet)
    TRUE_US  = 0x1F, // always true               (unordered, signaling)
};
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
enum class CompareTypeInt
|
||||
{
|
||||
EQ = _MM_CMPINT_EQ, // Equal
|
||||
LT = _MM_CMPINT_LT, // Less than
|
||||
LE = _MM_CMPINT_LE, // Less than or Equal
|
||||
NE = _MM_CMPINT_NE, // Not Equal
|
||||
GE = _MM_CMPINT_GE, // Greater than or Equal
|
||||
GT = _MM_CMPINT_GT, // Greater than
|
||||
};
|
||||
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
||||
/// Multiplier applied to an offset; the enumerator value is the multiplier
/// itself.
enum class ScaleFactor
{
    SF_1 = 1, // offset used as-is
    SF_2 = 2, // offset * 2
    SF_4 = 4, // offset * 4
    SF_8 = 8, // offset * 8
};
|
||||
|
||||
/// Rounding control: a rounding direction (low bits) optionally OR'ed with an
/// exception-suppression flag (0x08), plus named combinations of the two.
///
/// Inside the enumerator list of a scoped enum every enumerator already has
/// the underlying type (int), so the combinations are built with a plain '|'.
enum class RoundMode
{
    // Rounding direction
    TO_NEAREST_INT = 0x00, // Round to nearest integer
    TO_NEG_INF     = 0x01, // Round toward negative infinity
    TO_POS_INF     = 0x02, // Round toward positive infinity
    TO_ZERO        = 0x03, // Round toward zero, i.e. truncate
    CUR_DIRECTION  = 0x04, // Use the direction currently set in the MXCSR register

    // Exception behavior
    RAISE_EXC = 0x00, // Raise exceptions
    NO_EXC    = 0x08, // Suppress exceptions

    // Named direction/exception combinations
    NINT        = TO_NEAREST_INT | RAISE_EXC,
    NINT_NOEXC  = TO_NEAREST_INT | NO_EXC,
    FLOOR       = TO_NEG_INF | RAISE_EXC,
    FLOOR_NOEXC = TO_NEG_INF | NO_EXC,
    CEIL        = TO_POS_INF | RAISE_EXC,
    CEIL_NOEXC  = TO_POS_INF | NO_EXC,
    TRUNC       = TO_ZERO | RAISE_EXC,
    TRUNC_NOEXC = TO_ZERO | NO_EXC,
    RINT        = CUR_DIRECTION | RAISE_EXC,
    NEARBYINT   = CUR_DIRECTION | NO_EXC,
};
|
||||
|
||||
struct Traits
|
||||
{
|
||||
using CompareType = SIMDImpl::CompareType;
|
||||
using ScaleFactor = SIMDImpl::ScaleFactor;
|
||||
using RoundMode = SIMDImpl::RoundMode;
|
||||
};
|
||||
|
||||
// Attribute, 4-dimensional attribute in SIMD SOA layout
|
||||
template <typename Float, typename Integer, typename Double>
|
||||
union Vec4
|
||||
{
|
||||
Float v[4];
|
||||
Integer vi[4];
|
||||
Double vd[4];
|
||||
struct
|
||||
{
|
||||
Float x;
|
||||
Float y;
|
||||
Float z;
|
||||
Float w;
|
||||
};
|
||||
SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; }
|
||||
SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; }
|
||||
SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in)
|
||||
{
|
||||
v[0] = in.v[0];
|
||||
v[1] = in.v[1];
|
||||
v[2] = in.v[2];
|
||||
v[3] = in.v[3];
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
namespace SIMD128Impl
|
||||
{
|
||||
union Float
|
||||
{
|
||||
SIMDINLINE Float() = default;
|
||||
SIMDINLINE Float(__m128 in) : v(in) {}
|
||||
SIMDINLINE Float& SIMDCALL operator=(__m128 in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
|
||||
{
|
||||
v = in.v;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m128() const { return v; }
|
||||
|
||||
SIMDALIGN(__m128, 16) v;
|
||||
};
|
||||
|
||||
union Integer
|
||||
{
|
||||
SIMDINLINE Integer() = default;
|
||||
SIMDINLINE Integer(__m128i in) : v(in) {}
|
||||
SIMDINLINE Integer& SIMDCALL operator=(__m128i in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
|
||||
{
|
||||
v = in.v;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m128i() const { return v; }
|
||||
|
||||
SIMDALIGN(__m128i, 16) v;
|
||||
};
|
||||
|
||||
union Double
|
||||
{
|
||||
SIMDINLINE Double() = default;
|
||||
SIMDINLINE Double(__m128d in) : v(in) {}
|
||||
SIMDINLINE Double& SIMDCALL operator=(__m128d in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
|
||||
{
|
||||
v = in.v;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m128d() const { return v; }
|
||||
|
||||
SIMDALIGN(__m128d, 16) v;
|
||||
};
|
||||
|
||||
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
|
||||
using Mask = uint8_t;
|
||||
|
||||
static const uint32_t SIMD_WIDTH = 4;
|
||||
} // namespace SIMD128Impl
|
||||
|
||||
namespace SIMD256Impl
|
||||
{
|
||||
union Float
|
||||
{
|
||||
SIMDINLINE Float() = default;
|
||||
SIMDINLINE Float(__m256 in) : v(in) {}
|
||||
SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
|
||||
SIMD128Impl::Float const& in_hi = _mm_setzero_ps())
|
||||
{
|
||||
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
|
||||
}
|
||||
SIMDINLINE Float& SIMDCALL operator=(__m256 in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
|
||||
{
|
||||
v = in.v;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m256() const { return v; }
|
||||
|
||||
SIMDALIGN(__m256, 32) v;
|
||||
SIMD128Impl::Float v4[2];
|
||||
};
|
||||
|
||||
union Integer
|
||||
{
|
||||
SIMDINLINE Integer() = default;
|
||||
SIMDINLINE Integer(__m256i in) : v(in) {}
|
||||
SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
|
||||
SIMD128Impl::Integer const& in_hi = _mm_setzero_si128())
|
||||
{
|
||||
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
|
||||
}
|
||||
SIMDINLINE Integer& SIMDCALL operator=(__m256i in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
|
||||
{
|
||||
v = in.v;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m256i() const { return v; }
|
||||
|
||||
SIMDALIGN(__m256i, 32) v;
|
||||
SIMD128Impl::Integer v4[2];
|
||||
};
|
||||
|
||||
union Double
|
||||
{
|
||||
SIMDINLINE Double() = default;
|
||||
SIMDINLINE Double(__m256d const& in) : v(in) {}
|
||||
SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
|
||||
SIMD128Impl::Double const& in_hi = _mm_setzero_pd())
|
||||
{
|
||||
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
|
||||
}
|
||||
SIMDINLINE Double& SIMDCALL operator=(__m256d in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
|
||||
{
|
||||
v = in.v;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m256d() const { return v; }
|
||||
|
||||
SIMDALIGN(__m256d, 32) v;
|
||||
SIMD128Impl::Double v4[2];
|
||||
};
|
||||
|
||||
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
|
||||
using Mask = uint8_t;
|
||||
|
||||
static const uint32_t SIMD_WIDTH = 8;
|
||||
} // namespace SIMD256Impl
|
||||
|
||||
namespace SIMD512Impl
|
||||
{
|
||||
#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
|
||||
// Define AVX512 types if not included via immintrin.h.
|
||||
// All data members of these types are ONLY to viewed
|
||||
// in a debugger. Do NOT access them via code!
|
||||
union __m512
|
||||
{
|
||||
private:
|
||||
float m512_f32[16];
|
||||
};
|
||||
struct __m512d
|
||||
{
|
||||
private:
|
||||
double m512d_f64[8];
|
||||
};
|
||||
|
||||
union __m512i
|
||||
{
|
||||
private:
|
||||
int8_t m512i_i8[64];
|
||||
int16_t m512i_i16[32];
|
||||
int32_t m512i_i32[16];
|
||||
int64_t m512i_i64[8];
|
||||
uint8_t m512i_u8[64];
|
||||
uint16_t m512i_u16[32];
|
||||
uint32_t m512i_u32[16];
|
||||
uint64_t m512i_u64[8];
|
||||
};
|
||||
|
||||
using __mmask16 = uint16_t;
|
||||
#endif
|
||||
|
||||
#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
|
||||
#define SIMD_ALIGNMENT_BYTES 64
|
||||
#else
|
||||
#define SIMD_ALIGNMENT_BYTES 32
|
||||
#endif
|
||||
|
||||
union Float
|
||||
{
|
||||
SIMDINLINE Float() = default;
|
||||
SIMDINLINE Float(__m512 in) : v(in) {}
|
||||
SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
|
||||
SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
|
||||
{
|
||||
v8[0] = in_lo;
|
||||
v8[1] = in_hi;
|
||||
}
|
||||
SIMDINLINE Float& SIMDCALL operator=(__m512 in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
|
||||
{
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
v = in.v;
|
||||
#else
|
||||
v8[0] = in.v8[0];
|
||||
v8[1] = in.v8[1];
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE SIMDCALL operator __m512() const { return v; }
|
||||
|
||||
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
|
||||
SIMD256Impl::Float v8[2];
|
||||
};
|
||||
|
||||
union Integer
|
||||
{
|
||||
SIMDINLINE Integer() = default;
|
||||
SIMDINLINE Integer(__m512i in) : v(in) {}
|
||||
SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
|
||||
SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
|
||||
{
|
||||
v8[0] = in_lo;
|
||||
v8[1] = in_hi;
|
||||
}
|
||||
SIMDINLINE Integer& SIMDCALL operator=(__m512i in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
|
||||
{
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
v = in.v;
|
||||
#else
|
||||
v8[0] = in.v8[0];
|
||||
v8[1] = in.v8[1];
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
SIMDINLINE SIMDCALL operator __m512i() const { return v; }
|
||||
|
||||
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
|
||||
SIMD256Impl::Integer v8[2];
|
||||
};
|
||||
|
||||
union Double
|
||||
{
|
||||
SIMDINLINE Double() = default;
|
||||
SIMDINLINE Double(__m512d in) : v(in) {}
|
||||
SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
|
||||
SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
|
||||
{
|
||||
v8[0] = in_lo;
|
||||
v8[1] = in_hi;
|
||||
}
|
||||
SIMDINLINE Double& SIMDCALL operator=(__m512d in)
|
||||
{
|
||||
v = in;
|
||||
return *this;
|
||||
}
|
||||
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
|
||||
{
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
v = in.v;
|
||||
#else
|
||||
v8[0] = in.v8[0];
|
||||
v8[1] = in.v8[1];
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
SIMDINLINE SIMDCALL operator __m512d() const { return v; }
|
||||
|
||||
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
|
||||
SIMD256Impl::Double v8[2];
|
||||
};
|
||||
|
||||
typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
|
||||
using Mask = __mmask16;
|
||||
|
||||
static const uint32_t SIMD_WIDTH = 16;
|
||||
|
||||
#undef SIMD_ALIGNMENT_BYTES
|
||||
} // namespace SIMD512Impl
|
||||
} // namespace SIMDImpl
|
||||
|
|
@ -1,299 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#include "common/os.h"
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <algorithm>
|
||||
#include <mutex>
|
||||
|
||||
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma comment(lib, "user32.lib")
|
||||
#endif // _MSC_VER
|
||||
|
||||
/// Helpers for colorizing console output, plus the mutex that serializes the
/// stderr output of SwrAssert() and SwrTrace().
namespace ConsoleUtils
{
    /// Foreground colors. The RED/GREEN/BLUE bit values differ between the
    /// Windows and non-Windows builds; the mixed colors are OR-combinations.
    enum class TextColor
    {
        BLACK = 0,
#if defined(_WIN32)
        RED   = 4,
        GREEN = 2,
        BLUE  = 1,
#else
        RED   = 1,
        GREEN = 2,
        BLUE  = 4,
#endif // _WIN32
        PURPLE = RED | BLUE,
        CYAN   = GREEN | BLUE,
        YELLOW = RED | GREEN,
        WHITE  = RED | GREEN | BLUE,
    };

    enum class TextStyle
    {
        NORMAL    = 0,
        INTENSITY = 1,
    };

    /// Switches the text color/style of @p stream.
    ///
    /// Windows: only stderr and stdout are handled (through their console
    /// handles); any other stream is left untouched.
    /// Elsewhere: an ANSI escape sequence is written to @p stream.
    void SetTextColor(FILE*     stream,
                      TextColor color = TextColor::WHITE,
                      TextStyle style = TextStyle::NORMAL)
    {
#if defined(_WIN32)

        if (stream != stderr && stream != stdout)
        {
            // Not a console stream, do nothing
            return;
        }

        HANDLE hConsoleHandle =
            GetStdHandle((stream == stderr) ? STD_ERROR_HANDLE : STD_OUTPUT_HANDLE);

        WORD textAttributes = static_cast<WORD>(color);
        if (style == TextStyle::INTENSITY)
        {
            textAttributes |= FOREGROUND_INTENSITY;
        }
        SetConsoleTextAttribute(hConsoleHandle, textAttributes);

#else // !_WIN32

        // ANSI color code: base 30, +60 when INTENSITY is requested, plus the
        // color bits.
        const uint32_t intensityBias = (style == TextStyle::INTENSITY) ? 60 : 0;
        const uint32_t cc            = 30 + intensityBias + static_cast<uint32_t>(color);
        fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), cc);

#endif
    }

    /// Restores the default text appearance of @p stream.
    void ResetTextColor(FILE* stream)
    {
#if defined(_WIN32)

        // Same as asking for the default color and style.
        SetTextColor(stream);

#else // !_WIN32

        // ANSI "reset all attributes"
        fprintf(stream, "\033[0m");

#endif
    }

    // Taken by SwrAssert() / SwrTrace() for the whole duration of a report.
    static std::mutex g_stderrMutex;
} // namespace ConsoleUtils
|
||||
|
||||
bool SwrAssert(bool chkDebugger,
|
||||
bool& enabled,
|
||||
const char* pExpression,
|
||||
const char* pFileName,
|
||||
uint32_t lineNum,
|
||||
const char* pFunction,
|
||||
const char* pFmtString,
|
||||
...)
|
||||
{
|
||||
using namespace ConsoleUtils;
|
||||
std::lock_guard<std::mutex> l(g_stderrMutex);
|
||||
|
||||
SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
|
||||
|
||||
fprintf(stderr, "%s(%d): ", pFileName, lineNum);
|
||||
|
||||
SetTextColor(stderr, TextColor::RED, TextStyle::INTENSITY);
|
||||
|
||||
fprintf(stderr, "ASSERT: %s\n", pExpression);
|
||||
|
||||
SetTextColor(stderr, TextColor::CYAN, TextStyle::INTENSITY);
|
||||
fprintf(stderr, "\t%s\n", pFunction);
|
||||
|
||||
if (pFmtString)
|
||||
{
|
||||
SetTextColor(stderr, TextColor::YELLOW, TextStyle::INTENSITY);
|
||||
fprintf(stderr, "\t");
|
||||
va_list args;
|
||||
va_start(args, pFmtString);
|
||||
vfprintf(stderr, pFmtString, args);
|
||||
va_end(args);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
ResetTextColor(stderr);
|
||||
fflush(stderr);
|
||||
|
||||
#if defined(_WIN32)
|
||||
static const int MAX_MESSAGE_LEN = 2048;
|
||||
char msgBuf[MAX_MESSAGE_LEN];
|
||||
|
||||
sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression);
|
||||
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
|
||||
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
|
||||
OutputDebugStringA(msgBuf);
|
||||
|
||||
sprintf_s(msgBuf, "\t%s\n", pFunction);
|
||||
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
|
||||
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
|
||||
OutputDebugStringA(msgBuf);
|
||||
|
||||
int offset = 0;
|
||||
|
||||
if (pFmtString)
|
||||
{
|
||||
va_list args;
|
||||
va_start(args, pFmtString);
|
||||
offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
|
||||
va_end(args);
|
||||
|
||||
if (offset < 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
OutputDebugStringA("\t");
|
||||
OutputDebugStringA(msgBuf);
|
||||
OutputDebugStringA("\n");
|
||||
}
|
||||
|
||||
if (enabled && KNOB_ENABLE_ASSERT_DIALOGS)
|
||||
{
|
||||
int retval = sprintf_s(&msgBuf[offset],
|
||||
MAX_MESSAGE_LEN - offset,
|
||||
"\n\n"
|
||||
"File: %s\n"
|
||||
"Line: %d\n"
|
||||
"\n"
|
||||
"Expression: %s\n\n"
|
||||
"Cancel: Disable this assert for the remainder of the process\n"
|
||||
"Try Again: Break into the debugger\n"
|
||||
"Continue: Continue execution (but leave assert enabled)",
|
||||
pFileName,
|
||||
lineNum,
|
||||
pExpression);
|
||||
|
||||
if (retval < 0)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
offset += retval;
|
||||
|
||||
if (!IsDebuggerPresent())
|
||||
{
|
||||
sprintf_s(&msgBuf[offset],
|
||||
MAX_MESSAGE_LEN - offset,
|
||||
"\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a "
|
||||
"program crash!");
|
||||
}
|
||||
|
||||
retval = MessageBoxA(nullptr,
|
||||
msgBuf,
|
||||
"Assert Failed",
|
||||
MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND);
|
||||
|
||||
switch (retval)
|
||||
{
|
||||
case IDCANCEL:
|
||||
enabled = false;
|
||||
return false;
|
||||
|
||||
case IDTRYAGAIN:
|
||||
return true;
|
||||
|
||||
case IDCONTINUE:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
return (IsDebuggerPresent() || !chkDebugger) && enabled;
|
||||
}
|
||||
#endif // _WIN32
|
||||
|
||||
return enabled;
|
||||
}
|
||||
|
||||
void SwrTrace(
|
||||
const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...)
|
||||
{
|
||||
using namespace ConsoleUtils;
|
||||
std::lock_guard<std::mutex> l(g_stderrMutex);
|
||||
|
||||
SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
|
||||
|
||||
fprintf(stderr, "%s(%d): TRACE in %s:\n", pFileName, lineNum, pFunction);
|
||||
|
||||
if (pFmtString)
|
||||
{
|
||||
SetTextColor(stderr, TextColor::PURPLE, TextStyle::INTENSITY);
|
||||
fprintf(stderr, "\t");
|
||||
va_list args;
|
||||
va_start(args, pFmtString);
|
||||
vfprintf(stderr, pFmtString, args);
|
||||
va_end(args);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
ResetTextColor(stderr);
|
||||
fflush(stderr);
|
||||
|
||||
#if defined(_WIN32)
|
||||
static const int MAX_MESSAGE_LEN = 2048;
|
||||
char msgBuf[MAX_MESSAGE_LEN];
|
||||
|
||||
sprintf_s(msgBuf, "%s(%d): TRACE in %s\n", pFileName, lineNum, pFunction);
|
||||
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
|
||||
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
|
||||
OutputDebugStringA(msgBuf);
|
||||
|
||||
int offset = 0;
|
||||
|
||||
if (pFmtString)
|
||||
{
|
||||
va_list args;
|
||||
va_start(args, pFmtString);
|
||||
offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
|
||||
va_end(args);
|
||||
|
||||
if (offset < 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
OutputDebugStringA("\t");
|
||||
OutputDebugStringA(msgBuf);
|
||||
OutputDebugStringA("\n");
|
||||
}
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
|
||||
|
|
@ -1,242 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef __SWR_ASSERT_H__
|
||||
#define __SWR_ASSERT_H__
|
||||
|
||||
#if !defined(__SWR_OS_H__)
|
||||
#error swr_assert.h should not be included directly, please include "common/os.h" instead.
|
||||
#endif
|
||||
|
||||
//=============================================================================
|
||||
//
|
||||
// MACROS defined in this file:
|
||||
//
|
||||
// - SWR_ASSUME(expression, ...): Tell compiler that the expression is true.
|
||||
// Helps with static code analysis as well.
|
||||
// DO NOT USE if code after this dynamically
|
||||
// checks for errors and handles them. The
|
||||
// compiler may optimize out the error check.
|
||||
//
|
||||
// - SWR_ASSERT(expression, ...): Inform the user if expression is false.
|
||||
// This check is only conditionally made,
|
||||
// usually only in debug mode.
|
||||
//
|
||||
// - SWR_REL_ASSERT(expression, ...): Unconditionally enabled version of SWR_ASSERT
|
||||
//
|
||||
// - SWR_ASSUME_ASSERT(expression, ...): Conditionally enabled SWR_ASSERT. Uses
|
||||
// SWR_ASSUME if SWR_ASSERT is disabled.
|
||||
// DO NOT USE in combination with actual
|
||||
// error checking (see SWR_ASSUME)
|
||||
//
|
||||
// - SWR_REL_ASSUME_ASSERT(expression, ...): Same as SWR_REL_ASSERT.
|
||||
//
|
||||
//=============================================================================
|
||||
|
||||
// Preprocessor helpers that keep -Wall / -W4 quiet.
//
// _SWR_MACRO_START / _SWR_MACRO_END wrap a macro body in do { ... } while (0)
// so the expansion behaves like a single statement. With MSVC, warning 4127
// is switched off around the "while (0)" and restored right after it.
#if defined(_MSC_VER)
#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127))
#define _SWR_WARN_RESTORE __pragma(warning(pop))
#else // ! MSVC compiler
#define _SWR_WARN_DISABLE
#define _SWR_WARN_RESTORE
#endif

#define _SWR_MACRO_START do {

#define _SWR_MACRO_END _SWR_WARN_DISABLE } while (0) _SWR_WARN_RESTORE
|
||||
|
||||
// SWR_ASSUME(e, ...): promise the compiler that 'e' holds, using whatever the
// toolchain offers. Arguments after the expression are accepted and ignored.
#if defined(_MSC_VER)
#define SWR_ASSUME(e, ...) _SWR_MACRO_START __assume(e); _SWR_MACRO_END
#elif defined(__clang__)
#define SWR_ASSUME(e, ...) _SWR_MACRO_START __builtin_assume(e); _SWR_MACRO_END
#elif defined(__GNUC__)
// No direct "assume" builtin is used here: mark the false branch unreachable.
#define SWR_ASSUME(e, ...) \
    _SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \
    _SWR_MACRO_END
#else
#define SWR_ASSUME(e, ...) _SWR_MACRO_START ASSUME(e); _SWR_MACRO_END
#endif
|
||||
|
||||
// Defaults for the two assert switches; a value supplied by the build wins.
//   SWR_ENABLE_ASSERTS     - follows NDEBUG: 0 when NDEBUG is defined, else 1
//   SWR_ENABLE_REL_ASSERTS - 1
#if !defined(SWR_ENABLE_ASSERTS)

#if defined(NDEBUG)
#define SWR_ENABLE_ASSERTS 0
#else
#define SWR_ENABLE_ASSERTS 1
#endif // NDEBUG

#endif // !defined(SWR_ENABLE_ASSERTS)

#if !defined(SWR_ENABLE_REL_ASSERTS)
#define SWR_ENABLE_REL_ASSERTS 1
#endif
|
||||
|
||||
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
|
||||
#include "assert.h"
|
||||
|
||||
#if !defined(__cplusplus)
|
||||
|
||||
#pragma message("C++ is required for SWR Asserts, falling back to assert.h")
|
||||
|
||||
// Plain-C fallback: both assert flavors map onto assert(); any message
// arguments after the expression are dropped.
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) assert(e)
#endif

#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) assert(e)
#endif
|
||||
|
||||
#else
|
||||
|
||||
/// Reports a failed assertion; normally reached through the _SWR_ASSERT /
/// _SWR_INVALID macros rather than called directly.
///
/// @param chkDebugger  Passed as true by SWR_ASSERT and false by
///                     SWR_REL_ASSERT / SWR_INVALID.
/// @param enabled      Per-call-site enable flag owned by the macro expansion.
/// @param pExpression  Text of the failed expression.
/// @param pFileName    Source file of the assert.
/// @param lineNum      Source line of the assert.
/// @param pFunction    Name of the enclosing function.
/// @param pFmtString   Optional printf-style message, followed by its
///                     arguments.
/// @return true if the caller should execute DEBUGBREAK.
bool SwrAssert(bool        chkDebugger,
               bool&       enabled,
               const char* pExpression,
               const char* pFileName,
               uint32_t    lineNum,
               const char* pFunction,
               const char* pFmtString = nullptr,
               ...);
|
||||
|
||||
/// Emits a trace message; normally reached through the _SWR_TRACE macro.
///
/// @param pFileName   Source file of the trace statement.
/// @param lineNum     Source line of the trace statement.
/// @param pFunction   Name of the enclosing function.
/// @param pFmtString  printf-style message, followed by its arguments.
void SwrTrace(
    const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...);
|
||||
|
||||
// _SWR_ASSERT(chkDebugger, e, ...)
//   Evaluates 'e' once. On failure the report goes through SwrAssert(), and
//   DEBUGBREAK runs when SwrAssert() returns true. The function-local static
//   flag is handed to SwrAssert() as its 'enabled' argument, so every
//   expansion site carries its own flag.
#define _SWR_ASSERT(chkDebugger, e, ...) \
    _SWR_MACRO_START \
    bool expFailed = !(e); \
    if (expFailed) \
    { \
        static bool swrAssertEnabled = true; \
        expFailed = SwrAssert( \
            chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
        if (expFailed) \
        { \
            DEBUGBREAK; \
        } \
    } \
    _SWR_MACRO_END

// _SWR_INVALID(chkDebugger, ...)
//   Same reporting path without a condition: always calls SwrAssert(), with
//   an empty expression string.
#define _SWR_INVALID(chkDebugger, ...) \
    _SWR_MACRO_START \
    static bool swrAssertEnabled = true; \
    bool expFailed = SwrAssert( \
        chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
    if (expFailed) \
    { \
        DEBUGBREAK; \
    } \
    _SWR_MACRO_END

// _SWR_TRACE(fmt, ...)
//   Forwards to SwrTrace() with the current location. Note that the
//   expansion already ends in ';'.
#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__);
|
||||
|
||||
// Debug-only flavors (chkDebugger == true).
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__)
#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
#endif // SWR_ENABLE_ASSERTS

// Release flavors (chkDebugger == false).
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__)
#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)

// SWR_INVALID is always enabled
// Funky handling to allow 0 arguments with g++/gcc
// This is needed because you can't "swallow commas" with ##__VA_ARGS__ unless
// there is a first argument to the macro.  So having a macro that can optionally
// accept 0 arguments is tricky.
//
// How the dispatch works: _SWR_INVALID_VARGS yields 0 when SWR_INVALID was
// given no arguments and 1 otherwise; that digit is pasted onto
// "_SWR_INVALID_" to pick _SWR_INVALID_0 or _SWR_INVALID_1. The numbers
// produced by _SWR_INVALID_VARGS_0 are placeholders -- only how many there
// are matters.
#define _SWR_INVALID_0() _SWR_INVALID(false)
#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__)
#define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#define _SWR_INVALID_CONCAT_(a, b) a##b
#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b)
#define SWR_INVALID(...) \
    _SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \
    (__VA_ARGS__)

// Compile-time check; the optional extra arguments are string literals that
// get appended to the diagnostic. The expansion already ends in ';'.
#define SWR_STATIC_ASSERT(expression, ...) \
    static_assert((expression), "Failed:\n    " #expression "\n    " __VA_ARGS__);

#endif // SWR_ENABLE_REL_ASSERTS
|
||||
|
||||
#endif // C++
|
||||
|
||||
#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
|
||||
|
||||
/// Helper for the disabled assert macros: they wrap their expression as
/// sizeof(SwrSizeofWorkaround(e)), which is needed to allow passing bitfield
/// members to sizeof(). The argument is ignored; the result is always false.
template <typename T>
static bool SwrSizeofWorkaround(T)
{
    return false;
}
|
||||
|
||||
// Stubs used when the debug asserts are compiled out. The expression of
// SWR_ASSERT only appears inside sizeof(), i.e. as an unevaluated operand.
#if !SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) \
    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
    _SWR_MACRO_END
#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
#define SWR_TRACE(_fmtstr, ...) \
    _SWR_MACRO_START(void)(0); \
    _SWR_MACRO_END
#endif

// Stubs used when the release asserts are compiled out.
#if !SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) \
    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
    _SWR_MACRO_END
#define SWR_INVALID(...) \
    _SWR_MACRO_START(void)(0); \
    _SWR_MACRO_END
#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
#define SWR_REL_TRACE(_fmtstr, ...) \
    _SWR_MACRO_START(void)(0); \
    _SWR_MACRO_END
#define SWR_STATIC_ASSERT(e, ...) \
    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
    _SWR_MACRO_END
#endif
|
||||
|
||||
// SWR_FUNCTION_DECL: the most detailed "current function" string the compiler
// provides, falling back to __FUNCTION__.
#if defined(_MSC_VER)
#define SWR_FUNCTION_DECL __FUNCSIG__
#elif (defined(__GNUC__) || defined(__clang__))
#define SWR_FUNCTION_DECL __PRETTY_FUNCTION__
#else
#define SWR_FUNCTION_DECL __FUNCTION__
#endif

// SWR_NOT_IMPL: reports "<function> not implemented" through SWR_INVALID.
#define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL)
|
||||
|
||||
#endif //__SWR_ASSERT_H__
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,772 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file api.h
|
||||
*
|
||||
* @brief API definitions
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef __SWR_API_H__
|
||||
#define __SWR_API_H__
|
||||
|
||||
#include "common/os.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include "common/intrin.h"
|
||||
#include "common/formats.h"
|
||||
#include "core/state.h"
|
||||
|
||||
typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Rectangle structure
///
/// The min edges are inclusive and the max edges exclusive. All mutating
/// helpers work in place; Intersect() and Union() return *this so calls can
/// be chained.
struct SWR_RECT
{
    int32_t xmin; ///< inclusive
    int32_t ymin; ///< inclusive
    int32_t xmax; ///< exclusive
    int32_t ymax; ///< exclusive

    /// @brief Member-wise equality. const-qualified so that const rectangles
    ///        can be compared as well.
    bool operator==(const SWR_RECT& rhs) const
    {
        return (ymin == rhs.ymin && ymax == rhs.ymax && xmin == rhs.xmin && xmax == rhs.xmax);
    }

    bool operator!=(const SWR_RECT& rhs) const { return !(*this == rhs); }

    /// @brief Shrinks this rectangle to its overlap with @p other.
    ///        If there is no overlap (a max edge ends up below its min edge),
    ///        all four edges are reset to 0.
    SWR_RECT& Intersect(const SWR_RECT& other)
    {
        xmin = std::max(xmin, other.xmin);
        ymin = std::max(ymin, other.ymin);
        xmax = std::min(xmax, other.xmax);
        ymax = std::min(ymax, other.ymax);

        // Compare the edges directly: subtracting them first could overflow
        // for rectangles near the int32_t limits.
        if (xmax < xmin || ymax < ymin)
        {
            // Zero area
            ymin = ymax = xmin = xmax = 0;
        }

        return *this;
    }
    SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); }

    /// @brief Grows this rectangle to the bounding box of itself and @p other.
    SWR_RECT& Union(const SWR_RECT& other)
    {
        xmin = std::min(xmin, other.xmin);
        ymin = std::min(ymin, other.ymin);
        xmax = std::max(xmax, other.xmax);
        ymax = std::max(ymax, other.ymax);

        return *this;
    }

    SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); }

    /// @brief Moves the rectangle by (@p x, @p y) without changing its size.
    void Translate(int32_t x, int32_t y)
    {
        xmin += x;
        ymin += y;
        xmax += x;
        ymax += y;
    }
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Function signature for load hot tiles
|
||||
/// @param hDC - handle to DRAW_CONTEXT
|
||||
/// @param dstFormat - format of the hot tile
|
||||
/// @param renderTargetIndex - render target to store, can be color, depth or stencil
|
||||
/// @param x - destination x coordinate
|
||||
/// @param y - destination y coordinate
|
||||
/// @param pDstHotTile - pointer to the hot tile surface
|
||||
typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE hDC,
|
||||
HANDLE hWorkerPrivateData,
|
||||
SWR_FORMAT dstFormat,
|
||||
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
uint32_t renderTargetArrayIndex,
|
||||
uint8_t* pDstHotTile);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Function signature for store hot tiles
|
||||
/// @param hDC - handle to DRAW_CONTEXT
|
||||
/// @param srcFormat - format of the hot tile
|
||||
/// @param renderTargetIndex - render target to store, can be color, depth or stencil
|
||||
/// @param x - destination x coordinate
|
||||
/// @param y - destination y coordinate
|
||||
/// @param pSrcHotTile - pointer to the hot tile surface
|
||||
typedef void(SWR_API* PFN_STORE_TILE)(HANDLE hDC,
|
||||
HANDLE hWorkerPrivateData,
|
||||
SWR_FORMAT srcFormat,
|
||||
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
uint32_t renderTargetArrayIndex,
|
||||
uint8_t* pSrcHotTile);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Function signature for clearing from the hot tiles clear value
|
||||
/// @param hPrivateContext - handle to private data
|
||||
/// @param rtIndex - render target to clear, can be color, depth or stencil
|
||||
/// @param x - destination x coordinate
|
||||
/// @param y - destination y coordinate
|
||||
/// @param renderTargetArrayIndex - render target array offset from arrayIndex
|
||||
/// @param pClearColor - pointer to the hot tile's clear value
|
||||
typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE hPrivateContext,
|
||||
HANDLE hWorkerPrivateData,
|
||||
SWR_RENDERTARGET_ATTACHMENT rtIndex,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
uint32_t renderTargetArrayIndex,
|
||||
const float* pClearColor);
|
||||
|
||||
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE hPrivateContext,
|
||||
gfxptr_t xpAddr,
|
||||
bool* pbNullTileAccessed,
|
||||
HANDLE hPrivateWorkerData);
|
||||
|
||||
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE hPrivateContext,
|
||||
gfxptr_t xpAddr,
|
||||
bool* pbNullTileAccessed,
|
||||
HANDLE hPrivateWorkerData);
|
||||
|
||||
typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr);
|
||||
|
||||
typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory);
|
||||
|
||||
typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Callback to allow driver to update their copy of streamout write offset.
|
||||
/// This call is made for any draw operation that has streamout enabled
|
||||
/// and has updated the write offset.
|
||||
/// @param hPrivateContext - handle to private data
|
||||
/// @param soBufferSlot - buffer slot for write offset
|
||||
/// @param soWriteOffset - update value for so write offset.
|
||||
typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
|
||||
uint32_t soBufferSlot,
|
||||
uint32_t soWriteOffset);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Callback to allow driver to update their copy of stats.
|
||||
/// @param hPrivateContext - handle to private data
|
||||
/// @param pStats - pointer to draw stats
|
||||
typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Callback to allow driver to update their copy of FE stats.
|
||||
/// @note It's optimal to have a separate callback for FE stats since
|
||||
/// there is only one DC per FE thread. This means we do not have
|
||||
/// to sum up the stats across all of the workers.
|
||||
/// @param hPrivateContext - handle to private data
|
||||
/// @param pStats - pointer to draw stats
|
||||
typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Callback to allow driver to update StreamOut status
|
||||
/// @param hPrivateContext - handle to private data
|
||||
/// @param numPrims - number of primitives written to StreamOut buffer
|
||||
typedef void(SWR_API* PFN_UPDATE_STREAMOUT)(HANDLE hPrivateContext, uint64_t numPrims);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// BucketManager
|
||||
/// Forward Declaration (see rdtsc_buckets.h for full definition)
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
class BucketManager;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_THREADING_INFO
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Threading configuration passed in through SWR_CREATECONTEXT_INFO::pThreadInfo,
// where it overrides any set KNOB values.
// NOTE(review): the per-field notes below are inferred from the field names only;
// confirm against the thread-pool implementation.
struct SWR_THREADING_INFO
|
||||
{
|
||||
// presumably the first NUMA node / core / HW thread that SWR may use -- TODO confirm
uint32_t BASE_NUMA_NODE;
|
||||
uint32_t BASE_CORE;
|
||||
uint32_t BASE_THREAD;
|
||||
// presumably upper limits on worker threads and on the topology used -- TODO confirm
uint32_t MAX_WORKER_THREADS;
|
||||
uint32_t MAX_NUMA_NODES;
|
||||
uint32_t MAX_CORES_PER_NUMA_NODE;
|
||||
uint32_t MAX_THREADS_PER_CORE;
|
||||
// presumably disables worker threads when true -- TODO confirm
bool SINGLE_THREADED;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_API_THREADING_INFO
|
||||
/// Data used to reserve HW threads for API use
|
||||
/// API Threads are reserved from numa nodes / cores used for
|
||||
/// SWR Worker threads. Specifying reserved threads here can reduce
|
||||
/// the total number of SWR worker threads.
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Describes the HW threads to reserve for API use. Passed in (optionally) through
// SWR_CREATECONTEXT_INFO::pApiThreadInfo.
struct SWR_API_THREADING_INFO
|
||||
{
|
||||
uint32_t numAPIReservedThreads; // Number of HW threads to reserve for the API.
                                // Default is 1 if SWR_API_THREADING_INFO is not sent
|
||||
uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads is > 0,
|
||||
// binds thread used in SwrCreateContext to API Reserved
|
||||
// thread 0
|
||||
uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number.
|
||||
// Independent of KNOB_MAX_THREADS_PER_CORE.
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_CONTEXT
|
||||
/// Forward Declaration (see context.h for full definition)
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
struct SWR_CONTEXT;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_WORKER_PRIVATE_STATE
|
||||
/// Data used to allocate per-worker thread private data. A pointer
|
||||
/// to this data will be passed in to each shader function.
|
||||
/// The first field of this private data must be SWR_WORKER_DATA
|
||||
/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Describes the optional per-worker private data block: its size and the callbacks
// used to initialize and tear it down.
struct SWR_WORKER_PRIVATE_STATE
|
||||
{
|
||||
// Signature shared by the init and finish callbacks below. Receives the SWR context,
// the handle to the worker's private data and the worker's index.
typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
|
||||
|
||||
size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
|
||||
PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null
|
||||
///< worker data will be initialized to 0.
|
||||
PFN_WORKER_DATA pfnFinishWorkerData; ///< Finish / destroy function for worker data.
|
||||
///< Can be null.
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_CREATECONTEXT_INFO
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Creation parameters for SwrCreateContext. Mixes caller-supplied inputs (sizes,
// callbacks, optional threading overrides) with fields the comments below mark as
// outputs returned to the caller.
struct SWR_CREATECONTEXT_INFO
|
||||
{
|
||||
// External functions (e.g. sampler) need per draw context state.
|
||||
// Use SwrGetPrivateContextState() to access private state.
|
||||
size_t privateStateSize;
|
||||
|
||||
// Optional per-worker state, can be NULL for no worker-private data
|
||||
SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState;
|
||||
|
||||
// Callback functions
|
||||
// Hot tile load / store
PFN_LOAD_TILE pfnLoadTile;
|
||||
PFN_STORE_TILE pfnStoreTile;
|
||||
// gfxptr_t <-> system address translation
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
|
||||
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
|
||||
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
|
||||
// Memory context creation / destruction (both take the hExternalMemory handle)
PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
|
||||
PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
|
||||
// Streamout write offset, stats and streamout status updates back to the driver
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
|
||||
PFN_UPDATE_STATS pfnUpdateStats;
|
||||
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
|
||||
PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
|
||||
|
||||
|
||||
// Pointer to rdtsc buckets mgr returned to the caller.
|
||||
// Only populated when KNOB_ENABLE_RDTSC is set
|
||||
BucketManager* pBucketMgr;
|
||||
|
||||
// Output: size of the memory block that must be passed to SwrSaveState / SwrRestoreState
|
||||
size_t contextSaveSize;
|
||||
|
||||
// ArchRast event manager.
|
||||
HANDLE hArEventManager;
|
||||
|
||||
// handle to external memory for worker data to create memory contexts
|
||||
HANDLE hExternalMemory;
|
||||
|
||||
// Input (optional): Threading info that overrides any set KNOB values.
|
||||
SWR_THREADING_INFO* pThreadInfo;
|
||||
|
||||
// Input (optional): Info for reserving API threads
|
||||
SWR_API_THREADING_INFO* pApiThreadInfo;
|
||||
|
||||
// Input: if set to non-zero value, overrides KNOB value for maximum
|
||||
// number of draws in flight
|
||||
uint32_t MAX_DRAWS_IN_FLIGHT;
|
||||
|
||||
// NOTE(review): presumably a human-readable name for the context; its use is not
// visible from this header -- confirm.
std::string contextName;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Create SWR Context.
|
||||
/// @param pCreateInfo - pointer to creation info.
|
||||
SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Destroys SWR Context.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
SWR_FUNC(void, SwrDestroyContext, HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Bind current thread to an API reserved HW thread
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param apiThreadId - index of reserved HW thread to bind to.
|
||||
SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Saves API state associated with hContext
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pOutputStateBlock - Memory block to receive API state data
|
||||
/// @param memSize - Size of memory pointed to by pOutputStateBlock
|
||||
SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Restores API state to hContext previously saved with SwrSaveState
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pStateBlock - Memory block to read API state data from
|
||||
/// @param memSize - Size of memory pointed to by pStateBlock
|
||||
SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
|
||||
/// has been completed
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnFunc - pointer to callback function,
|
||||
/// @param userData - user data to pass back
|
||||
SWR_FUNC(void,
|
||||
SwrSync,
|
||||
HANDLE hContext,
|
||||
PFN_CALLBACK_FUNC pfnFunc,
|
||||
uint64_t userData,
|
||||
uint64_t userData2,
|
||||
uint64_t userData3);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Stall cmd. Stalls the backend until all previous work has been completed.
|
||||
/// Frontend work can continue to make progress
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
SWR_FUNC(void, SwrStallBE, HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Blocks until all rendering has been completed.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Blocks until all FE rendering has been completed.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set vertex buffer state.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param numBuffers - Number of vertex buffer state descriptors.
|
||||
/// @param pVertexBuffers - Array of vertex buffer state descriptors.
|
||||
SWR_FUNC(void,
|
||||
SwrSetVertexBuffers,
|
||||
HANDLE hContext,
|
||||
uint32_t numBuffers,
|
||||
const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set index buffer
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pIndexBuffer - Index buffer.
|
||||
SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set fetch shader pointer.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnFetchFunc - Pointer to shader.
|
||||
SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set streamout shader pointer.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnSoFunc - Pointer to shader.
|
||||
/// @param streamIndex - specifies stream
|
||||
SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set streamout state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pSoState - Pointer to streamout state.
|
||||
SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set streamout buffer state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pSoBuffer - Pointer to streamout buffer.
|
||||
/// @param slot - Slot to bind SO buffer to.
|
||||
SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set vertex shader pointer.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnVertexFunc - Pointer to shader.
|
||||
SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set frontend state.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state
|
||||
SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set geometry shader state.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state
|
||||
SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set geometry shader
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnGsFunc - Pointer to geometry shader function
|
||||
SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set compute shader
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnCsFunc - Pointer to compute shader function
|
||||
/// @param totalThreadsInGroup - product of thread group dimensions.
|
||||
/// @param totalSpillFillSize - size in bytes needed for spill/fill.
|
||||
/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance
|
||||
/// @param numInstances - number of simd instances that are run per execution of the shader
|
||||
SWR_FUNC(void,
|
||||
SwrSetCsFunc,
|
||||
HANDLE hContext,
|
||||
PFN_CS_FUNC pfnCsFunc,
|
||||
uint32_t totalThreadsInGroup,
|
||||
uint32_t totalSpillFillSize,
|
||||
uint32_t scratchSpaceSizePerInstance,
|
||||
uint32_t numInstances);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set tessellation state.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state
|
||||
SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set hull shader
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnFunc - Pointer to shader function
|
||||
SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set domain shader
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pfnFunc - Pointer to shader function
|
||||
SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set depth stencil state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state.
|
||||
SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set backend state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state.
|
||||
SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set depth bounds state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state.
|
||||
SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set pixel shader state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state.
|
||||
SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set blend state
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pState - Pointer to state.
|
||||
SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set blend function
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param renderTarget - render target index
|
||||
/// @param pfnBlendFunc - function pointer
|
||||
SWR_FUNC(
|
||||
void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrDraw
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param topology - Specifies topology for draw.
|
||||
/// @param startVertex - Specifies start vertex in vertex buffer for draw.
|
||||
/// @param primCount - Number of vertices.
|
||||
SWR_FUNC(void,
|
||||
SwrDraw,
|
||||
HANDLE hContext,
|
||||
PRIMITIVE_TOPOLOGY topology,
|
||||
uint32_t startVertex,
|
||||
uint32_t primCount);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrDrawInstanced
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param topology - Specifies topology for draw.
|
||||
/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
|
||||
/// @param numInstances - How many instances to render.
|
||||
/// @param startVertex - Specifies start vertex for draw. (vertex data)
|
||||
/// @param startInstance - Which instance to start sequentially fetching from in each buffer
|
||||
/// (instanced data)
|
||||
SWR_FUNC(void,
|
||||
SwrDrawInstanced,
|
||||
HANDLE hContext,
|
||||
PRIMITIVE_TOPOLOGY topology,
|
||||
uint32_t numVertsPerInstance,
|
||||
uint32_t numInstances,
|
||||
uint32_t startVertex,
|
||||
uint32_t startInstance);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrDrawIndexed
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param topology - Specifies topology for draw.
|
||||
/// @param numIndices - Number of indices to read sequentially from index buffer.
|
||||
/// @param indexOffset - Starting index into index buffer.
|
||||
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
|
||||
SWR_FUNC(void,
|
||||
SwrDrawIndexed,
|
||||
HANDLE hContext,
|
||||
PRIMITIVE_TOPOLOGY topology,
|
||||
uint32_t numIndices,
|
||||
uint32_t indexOffset,
|
||||
int32_t baseVertex);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrDrawIndexedInstanced
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param topology - Specifies topology for draw.
|
||||
/// @param numIndices - Number of indices to read sequentially from index buffer.
|
||||
/// @param numInstances - Number of instances to render.
|
||||
/// @param indexOffset - Starting index into index buffer.
|
||||
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
|
||||
/// @param startInstance - Which instance to start sequentially fetching from in each buffer
|
||||
/// (instanced data)
|
||||
SWR_FUNC(void,
|
||||
SwrDrawIndexedInstanced,
|
||||
HANDLE hContext,
|
||||
PRIMITIVE_TOPOLOGY topology,
|
||||
uint32_t numIndices,
|
||||
uint32_t numInstances,
|
||||
uint32_t indexOffset,
|
||||
int32_t baseVertex,
|
||||
uint32_t startInstance);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrInvalidateTiles
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
|
||||
/// invalidate.
|
||||
/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
|
||||
/// be hottile size-aligned.
|
||||
SWR_FUNC(void,
|
||||
SwrInvalidateTiles,
|
||||
HANDLE hContext,
|
||||
uint32_t attachmentMask,
|
||||
const SWR_RECT& invalidateRect);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrDiscardRect
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
|
||||
/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be
|
||||
/// discarded.
|
||||
SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrDispatch
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
|
||||
/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
|
||||
/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
|
||||
SWR_FUNC(void,
|
||||
SwrDispatch,
|
||||
HANDLE hContext,
|
||||
uint32_t threadGroupCountX,
|
||||
uint32_t threadGroupCountY,
|
||||
uint32_t threadGroupCountZ);
|
||||
|
||||
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
|
||||
// State a hot tile can be left in; passed to SwrStoreTiles as postStoreTileState.
// NOTE(review): the value 1 is not defined here; presumably it corresponds to a
// HOTTILE_STATE entry that is not exposed through the API -- confirm before renumbering.
enum SWR_TILE_STATE
|
||||
{
|
||||
SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents
|
||||
// before rendering
|
||||
SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents
|
||||
SWR_TILE_RESOLVED = 3, // tile is in sync with surface it represents
|
||||
};
|
||||
|
||||
/// @todo Add a good description for what attachments are and when and why you would use the
|
||||
/// different SWR_TILE_STATEs.
|
||||
SWR_FUNC(void,
|
||||
SwrStoreTiles,
|
||||
HANDLE hContext,
|
||||
uint32_t attachmentMask,
|
||||
SWR_TILE_STATE postStoreTileState,
|
||||
const SWR_RECT& storeRect);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
|
||||
/// @param renderTargetArrayIndex - the RT array index to clear
|
||||
/// @param clearColor - color used for clearing render targets
|
||||
/// @param z - depth value used for clearing depth buffer
|
||||
/// @param stencil - stencil value used for clearing stencil buffer
|
||||
/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
|
||||
SWR_FUNC(void,
|
||||
SwrClearRenderTarget,
|
||||
HANDLE hContext,
|
||||
uint32_t attachmentMask,
|
||||
uint32_t renderTargetArrayIndex,
|
||||
const float clearColor[4],
|
||||
float z,
|
||||
uint8_t stencil,
|
||||
const SWR_RECT& clearRect);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrSetRastState
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
|
||||
SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrSetViewports
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param numViewports - number of viewports passed in
|
||||
/// @param pViewports - Specifies extents of viewport.
|
||||
/// @param pMatrices - If not specified then SWR computes a default one.
|
||||
SWR_FUNC(void,
|
||||
SwrSetViewports,
|
||||
HANDLE hContext,
|
||||
uint32_t numViewports,
|
||||
const SWR_VIEWPORT* pViewports,
|
||||
const SWR_VIEWPORT_MATRICES* pMatrices);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief SwrSetScissorRects
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param numScissors - number of scissors passed in
|
||||
/// @param pScissors - array of scissors
|
||||
SWR_FUNC(
|
||||
void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Returns a pointer to the private context state for the current
|
||||
/// draw operation. This is used for external components such as the
|
||||
/// sampler.
|
||||
///
|
||||
/// @note Client needs to resend private state prior to each draw call.
|
||||
/// Also, SWR is responsible for the private state memory.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Clients can use this to allocate memory for draw/dispatch
|
||||
/// operations. The memory will automatically be freed once operation
|
||||
/// has completed. Client can use this to allocate binding tables,
|
||||
/// etc. needed for shader execution.
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param size - Size of allocation
|
||||
/// @param align - Alignment needed for allocation.
|
||||
SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Enables frontend (FE) stats counting
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param enable - If true then counts are incremented.
|
||||
SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Enables backend (BE) stats counting
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
/// @param enable - If true then counts are incremented.
|
||||
SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Mark end of frame - used for performance profiling
|
||||
/// @param hContext - Handle passed back from SwrCreateContext
|
||||
SWR_FUNC(void, SwrEndFrame, HANDLE hContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Initialize swr backend and memory internal tables
|
||||
SWR_FUNC(void, SwrInit);
|
||||
|
||||
|
||||
struct SWR_INTERFACE
|
||||
{
|
||||
PFNSwrCreateContext pfnSwrCreateContext;
|
||||
PFNSwrDestroyContext pfnSwrDestroyContext;
|
||||
PFNSwrBindApiThread pfnSwrBindApiThread;
|
||||
PFNSwrSaveState pfnSwrSaveState;
|
||||
PFNSwrRestoreState pfnSwrRestoreState;
|
||||
PFNSwrSync pfnSwrSync;
|
||||
PFNSwrStallBE pfnSwrStallBE;
|
||||
PFNSwrWaitForIdle pfnSwrWaitForIdle;
|
||||
PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
|
||||
PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;
|
||||
PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer;
|
||||
PFNSwrSetFetchFunc pfnSwrSetFetchFunc;
|
||||
PFNSwrSetSoFunc pfnSwrSetSoFunc;
|
||||
PFNSwrSetSoState pfnSwrSetSoState;
|
||||
PFNSwrSetSoBuffers pfnSwrSetSoBuffers;
|
||||
PFNSwrSetVertexFunc pfnSwrSetVertexFunc;
|
||||
PFNSwrSetFrontendState pfnSwrSetFrontendState;
|
||||
PFNSwrSetGsState pfnSwrSetGsState;
|
||||
PFNSwrSetGsFunc pfnSwrSetGsFunc;
|
||||
PFNSwrSetCsFunc pfnSwrSetCsFunc;
|
||||
PFNSwrSetTsState pfnSwrSetTsState;
|
||||
PFNSwrSetHsFunc pfnSwrSetHsFunc;
|
||||
PFNSwrSetDsFunc pfnSwrSetDsFunc;
|
||||
PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState;
|
||||
PFNSwrSetBackendState pfnSwrSetBackendState;
|
||||
PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState;
|
||||
PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState;
|
||||
PFNSwrSetBlendState pfnSwrSetBlendState;
|
||||
PFNSwrSetBlendFunc pfnSwrSetBlendFunc;
|
||||
PFNSwrDraw pfnSwrDraw;
|
||||
PFNSwrDrawInstanced pfnSwrDrawInstanced;
|
||||
PFNSwrDrawIndexed pfnSwrDrawIndexed;
|
||||
PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced;
|
||||
PFNSwrInvalidateTiles pfnSwrInvalidateTiles;
|
||||
PFNSwrDiscardRect pfnSwrDiscardRect;
|
||||
PFNSwrDispatch pfnSwrDispatch;
|
||||
PFNSwrStoreTiles pfnSwrStoreTiles;
|
||||
PFNSwrClearRenderTarget pfnSwrClearRenderTarget;
|
||||
PFNSwrSetRastState pfnSwrSetRastState;
|
||||
PFNSwrSetViewports pfnSwrSetViewports;
|
||||
PFNSwrSetScissorRects pfnSwrSetScissorRects;
|
||||
PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState;
|
||||
PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory;
|
||||
PFNSwrEnableStatsFE pfnSwrEnableStatsFE;
|
||||
PFNSwrEnableStatsBE pfnSwrEnableStatsBE;
|
||||
PFNSwrEndFrame pfnSwrEndFrame;
|
||||
PFNSwrInit pfnSwrInit;
|
||||
};
|
||||
|
||||
extern "C" {
|
||||
typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs);
|
||||
SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -1,490 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file arena.h
|
||||
*
|
||||
* @brief Arena memory manager
|
||||
* The arena is convenient and fast for managing allocations for any of
|
||||
* our allocations that are associated with operations and can all be freed
|
||||
* once when their operation has completed. Allocations are cheap since
|
||||
* most of the time it's simply an increment of an offset. Also, no need to
|
||||
* free individual allocations. All of the arena memory can be freed at once.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <mutex>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include "core/utils.h"
|
||||
|
||||
/// Alignment (in bytes) requested for every arena block; the ArenaBlock header
/// must fit inside one such unit (enforced by the static_assert below).
static const size_t ARENA_BLOCK_ALIGN = 64;

/// Header stored at the start of each block handed out by the arena allocators.
struct ArenaBlock
{
    size_t      blockSize{0};    ///< Total size of the block in bytes, header included.
    ArenaBlock* pNext{nullptr};  ///< Next block in whichever singly linked list owns this one.
};
static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
|
||||
|
||||
class DefaultAllocator
|
||||
{
|
||||
public:
|
||||
ArenaBlock* AllocateAligned(size_t size, size_t align)
|
||||
{
|
||||
SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
|
||||
|
||||
ArenaBlock* p = new (AlignedMalloc(size, align)) ArenaBlock();
|
||||
p->blockSize = size;
|
||||
return p;
|
||||
}
|
||||
|
||||
void Free(ArenaBlock* pMem)
|
||||
{
|
||||
if (pMem)
|
||||
{
|
||||
SWR_ASSUME_ASSERT(pMem->blockSize < size_t(0xdddddddd));
|
||||
AlignedFree(pMem);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Caching Allocator for Arena
|
||||
template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
|
||||
struct CachingAllocatorT : DefaultAllocator
|
||||
{
|
||||
ArenaBlock* AllocateAligned(size_t size, size_t align)
|
||||
{
|
||||
SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
|
||||
SWR_ASSUME_ASSERT(size <= uint32_t(-1));
|
||||
|
||||
uint32_t bucket = GetBucketId(size);
|
||||
|
||||
{
|
||||
// search cached blocks
|
||||
std::lock_guard<std::mutex> l(m_mutex);
|
||||
ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
|
||||
ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align);
|
||||
|
||||
if (pBlock)
|
||||
{
|
||||
m_cachedSize -= pBlock->blockSize;
|
||||
if (pBlock == m_pLastCachedBlocks[bucket])
|
||||
{
|
||||
m_pLastCachedBlocks[bucket] = pPrevBlock;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pPrevBlock = &m_oldCachedBlocks[bucket];
|
||||
pBlock = SearchBlocks(pPrevBlock, size, align);
|
||||
|
||||
if (pBlock)
|
||||
{
|
||||
m_oldCachedSize -= pBlock->blockSize;
|
||||
if (pBlock == m_pOldLastCachedBlocks[bucket])
|
||||
{
|
||||
m_pOldLastCachedBlocks[bucket] = pPrevBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (pBlock)
|
||||
{
|
||||
assert(pPrevBlock && pPrevBlock->pNext == pBlock);
|
||||
pPrevBlock->pNext = pBlock->pNext;
|
||||
pBlock->pNext = nullptr;
|
||||
|
||||
return pBlock;
|
||||
}
|
||||
|
||||
m_totalAllocated += size;
|
||||
|
||||
#if 0
|
||||
{
|
||||
static uint32_t count = 0;
|
||||
char buf[128];
|
||||
sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
|
||||
OutputDebugStringA(buf);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (bucket && bucket < (CACHE_NUM_BUCKETS - 1))
|
||||
{
|
||||
// Make all blocks in this bucket the same size
|
||||
size = size_t(1) << (bucket + 1 + CACHE_START_BUCKET_BIT);
|
||||
}
|
||||
|
||||
return this->DefaultAllocator::AllocateAligned(size, align);
|
||||
}
|
||||
|
||||
void Free(ArenaBlock* pMem)
|
||||
{
|
||||
if (pMem)
|
||||
{
|
||||
std::unique_lock<std::mutex> l(m_mutex);
|
||||
InsertCachedBlock(GetBucketId(pMem->blockSize), pMem);
|
||||
}
|
||||
}
|
||||
|
||||
void FreeOldBlocks()
|
||||
{
|
||||
if (!m_cachedSize)
|
||||
{
|
||||
return;
|
||||
}
|
||||
std::lock_guard<std::mutex> l(m_mutex);
|
||||
|
||||
bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
|
||||
|
||||
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
|
||||
{
|
||||
if (doFree)
|
||||
{
|
||||
ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
|
||||
while (pBlock)
|
||||
{
|
||||
ArenaBlock* pNext = pBlock->pNext;
|
||||
m_oldCachedSize -= pBlock->blockSize;
|
||||
m_totalAllocated -= pBlock->blockSize;
|
||||
this->DefaultAllocator::Free(pBlock);
|
||||
pBlock = pNext;
|
||||
}
|
||||
m_oldCachedBlocks[i].pNext = nullptr;
|
||||
m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
|
||||
}
|
||||
|
||||
if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
|
||||
{
|
||||
if (i && i < (CACHE_NUM_BUCKETS - 1))
|
||||
{
|
||||
// We know that all blocks are the same size.
|
||||
// Just move the list over.
|
||||
m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
|
||||
m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext;
|
||||
m_cachedBlocks[i].pNext = nullptr;
|
||||
if (m_pOldLastCachedBlocks[i]->pNext)
|
||||
{
|
||||
m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
|
||||
}
|
||||
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
|
||||
}
|
||||
else
|
||||
{
|
||||
// The end buckets can have variable sized lists.
|
||||
// Insert each block based on size
|
||||
ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
|
||||
while (pBlock)
|
||||
{
|
||||
ArenaBlock* pNext = pBlock->pNext;
|
||||
pBlock->pNext = nullptr;
|
||||
m_cachedSize -= pBlock->blockSize;
|
||||
InsertCachedBlock<true>(i, pBlock);
|
||||
pBlock = pNext;
|
||||
}
|
||||
|
||||
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
|
||||
m_cachedBlocks[i].pNext = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m_oldCachedSize += m_cachedSize;
|
||||
m_cachedSize = 0;
|
||||
}
|
||||
|
||||
CachingAllocatorT()
|
||||
{
|
||||
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
|
||||
{
|
||||
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
|
||||
m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
|
||||
}
|
||||
}
|
||||
|
||||
~CachingAllocatorT()
|
||||
{
|
||||
// Free all cached blocks
|
||||
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
|
||||
{
|
||||
ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
|
||||
while (pBlock)
|
||||
{
|
||||
ArenaBlock* pNext = pBlock->pNext;
|
||||
this->DefaultAllocator::Free(pBlock);
|
||||
pBlock = pNext;
|
||||
}
|
||||
pBlock = m_oldCachedBlocks[i].pNext;
|
||||
while (pBlock)
|
||||
{
|
||||
ArenaBlock* pNext = pBlock->pNext;
|
||||
this->DefaultAllocator::Free(pBlock);
|
||||
pBlock = pNext;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static uint32_t GetBucketId(size_t blockSize)
|
||||
{
|
||||
uint32_t bucketId = 0;
|
||||
|
||||
#if defined(BitScanReverseSizeT)
|
||||
BitScanReverseSizeT((unsigned long*)&bucketId, (blockSize - 1) >> CACHE_START_BUCKET_BIT);
|
||||
bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
|
||||
#endif
|
||||
|
||||
return bucketId;
|
||||
}
|
||||
|
||||
template <bool OldBlockT = false>
|
||||
void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock)
|
||||
{
|
||||
SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS);
|
||||
|
||||
ArenaBlock* pPrevBlock =
|
||||
OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId];
|
||||
ArenaBlock* pBlock = pPrevBlock->pNext;
|
||||
|
||||
while (pBlock)
|
||||
{
|
||||
if (pNewBlock->blockSize >= pBlock->blockSize)
|
||||
{
|
||||
// Insert here
|
||||
break;
|
||||
}
|
||||
pPrevBlock = pBlock;
|
||||
pBlock = pBlock->pNext;
|
||||
}
|
||||
|
||||
// Insert into list
|
||||
SWR_ASSUME_ASSERT(pPrevBlock);
|
||||
pPrevBlock->pNext = pNewBlock;
|
||||
pNewBlock->pNext = pBlock;
|
||||
|
||||
if (OldBlockT)
|
||||
{
|
||||
if (m_pOldLastCachedBlocks[bucketId] == pPrevBlock)
|
||||
{
|
||||
m_pOldLastCachedBlocks[bucketId] = pNewBlock;
|
||||
}
|
||||
|
||||
m_oldCachedSize += pNewBlock->blockSize;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m_pLastCachedBlocks[bucketId] == pPrevBlock)
|
||||
{
|
||||
m_pLastCachedBlocks[bucketId] = pNewBlock;
|
||||
}
|
||||
|
||||
m_cachedSize += pNewBlock->blockSize;
|
||||
}
|
||||
}
|
||||
|
||||
static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
|
||||
{
|
||||
ArenaBlock* pBlock = pPrevBlock->pNext;
|
||||
ArenaBlock* pPotentialBlock = nullptr;
|
||||
ArenaBlock* pPotentialPrev = nullptr;
|
||||
|
||||
while (pBlock)
|
||||
{
|
||||
if (pBlock->blockSize >= blockSize)
|
||||
{
|
||||
if (pBlock == AlignUp(pBlock, align))
|
||||
{
|
||||
if (pBlock->blockSize == blockSize)
|
||||
{
|
||||
// Won't find a better match
|
||||
break;
|
||||
}
|
||||
|
||||
// We could use this as it is larger than we wanted, but
|
||||
// continue to search for a better match
|
||||
pPotentialBlock = pBlock;
|
||||
pPotentialPrev = pPrevBlock;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Blocks are sorted by size (biggest first)
|
||||
// So, if we get here, there are no blocks
|
||||
// large enough, fall through to allocation.
|
||||
pBlock = nullptr;
|
||||
break;
|
||||
}
|
||||
|
||||
pPrevBlock = pBlock;
|
||||
pBlock = pBlock->pNext;
|
||||
}
|
||||
|
||||
if (!pBlock)
|
||||
{
|
||||
// Couldn't find an exact match, use next biggest size
|
||||
pBlock = pPotentialBlock;
|
||||
pPrevBlock = pPotentialPrev;
|
||||
}
|
||||
|
||||
return pBlock;
|
||||
}
|
||||
|
||||
// buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
|
||||
static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
|
||||
static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
|
||||
static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE);
|
||||
|
||||
ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
|
||||
ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
|
||||
ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS];
|
||||
ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
|
||||
std::mutex m_mutex;
|
||||
|
||||
size_t m_totalAllocated = 0;
|
||||
|
||||
size_t m_cachedSize = 0;
|
||||
size_t m_oldCachedSize = 0;
|
||||
};
|
||||
typedef CachingAllocatorT<> CachingAllocator;
|
||||
|
||||
template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
|
||||
class TArena
|
||||
{
|
||||
public:
|
||||
TArena(T& in_allocator) : m_allocator(in_allocator) {}
|
||||
TArena() : m_allocator(m_defAllocator) {}
|
||||
~TArena() { Reset(true); }
|
||||
|
||||
void* AllocAligned(size_t size, size_t align)
|
||||
{
|
||||
if (0 == size)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
|
||||
|
||||
if (m_pCurBlock)
|
||||
{
|
||||
ArenaBlock* pCurBlock = m_pCurBlock;
|
||||
size_t offset = AlignUp(m_offset, align);
|
||||
|
||||
if ((offset + size) <= pCurBlock->blockSize)
|
||||
{
|
||||
void* pMem = PtrAdd(pCurBlock, offset);
|
||||
m_offset = offset + size;
|
||||
return pMem;
|
||||
}
|
||||
|
||||
// Not enough memory in this block, fall through to allocate
|
||||
// a new block
|
||||
}
|
||||
|
||||
static const size_t ArenaBlockSize = BlockSizeT;
|
||||
size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
|
||||
|
||||
// Add in one BLOCK_ALIGN unit to store ArenaBlock in.
|
||||
blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
|
||||
|
||||
ArenaBlock* pNewBlock = m_allocator.AllocateAligned(
|
||||
blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned.
|
||||
SWR_ASSERT(pNewBlock != nullptr);
|
||||
|
||||
if (pNewBlock != nullptr)
|
||||
{
|
||||
m_offset = ARENA_BLOCK_ALIGN;
|
||||
pNewBlock->pNext = m_pCurBlock;
|
||||
|
||||
m_pCurBlock = pNewBlock;
|
||||
}
|
||||
|
||||
return AllocAligned(size, align);
|
||||
}
|
||||
|
||||
void* Alloc(size_t size) { return AllocAligned(size, 1); }
|
||||
|
||||
void* AllocAlignedSync(size_t size, size_t align)
|
||||
{
|
||||
void* pAlloc = nullptr;
|
||||
|
||||
m_mutex.lock();
|
||||
pAlloc = AllocAligned(size, align);
|
||||
m_mutex.unlock();
|
||||
|
||||
return pAlloc;
|
||||
}
|
||||
|
||||
void* AllocSync(size_t size)
|
||||
{
|
||||
void* pAlloc = nullptr;
|
||||
|
||||
m_mutex.lock();
|
||||
pAlloc = Alloc(size);
|
||||
m_mutex.unlock();
|
||||
|
||||
return pAlloc;
|
||||
}
|
||||
|
||||
void Reset(bool removeAll = false)
|
||||
{
|
||||
m_offset = ARENA_BLOCK_ALIGN;
|
||||
|
||||
if (m_pCurBlock)
|
||||
{
|
||||
ArenaBlock* pUsedBlocks = m_pCurBlock->pNext;
|
||||
m_pCurBlock->pNext = nullptr;
|
||||
while (pUsedBlocks)
|
||||
{
|
||||
ArenaBlock* pBlock = pUsedBlocks;
|
||||
pUsedBlocks = pBlock->pNext;
|
||||
|
||||
m_allocator.Free(pBlock);
|
||||
}
|
||||
|
||||
if (removeAll)
|
||||
{
|
||||
m_allocator.Free(m_pCurBlock);
|
||||
m_pCurBlock = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool IsEmpty()
|
||||
{
|
||||
return (m_pCurBlock == nullptr) ||
|
||||
(m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
|
||||
}
|
||||
|
||||
private:
|
||||
ArenaBlock* m_pCurBlock = nullptr;
|
||||
size_t m_offset = ARENA_BLOCK_ALIGN;
|
||||
|
||||
/// @note Mutex is only used by sync allocation functions.
|
||||
std::mutex m_mutex;
|
||||
|
||||
DefaultAllocator m_defAllocator;
|
||||
T& m_allocator;
|
||||
};
|
||||
|
||||
using StdArena = TArena<DefaultAllocator>;
|
||||
using CachingArena = TArena<CachingAllocator>;
|
||||
|
|
@ -1,420 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file backend.cpp
|
||||
*
|
||||
* @brief Backend handles rasterization, pixel shading and output merger
|
||||
* operations.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "backend_impl.h"
|
||||
#include "tilemgr.h"
|
||||
#include "memory/tilingtraits.h"
|
||||
#include "core/multisample.h"
|
||||
#include "backends/gen_BackendPixelRate.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Process compute work.
|
||||
/// @param pDC - pointer to draw context (dispatch).
|
||||
/// @param workerId - The unique worker ID that is assigned to this thread.
|
||||
/// @param threadGroupId - the linear index for the thread group within the dispatch.
|
||||
void ProcessComputeBE(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t threadGroupId,
|
||||
void*& pSpillFillBuffer,
|
||||
void*& pScratchSpace)
|
||||
{
|
||||
SWR_CONTEXT* pContext = pDC->pContext;
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId);
|
||||
|
||||
const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
|
||||
SWR_ASSERT(pTaskData != nullptr);
|
||||
|
||||
// Ensure spill fill memory has been allocated.
|
||||
size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
|
||||
if (spillFillSize && pSpillFillBuffer == nullptr)
|
||||
{
|
||||
pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES);
|
||||
}
|
||||
|
||||
size_t scratchSpaceSize =
|
||||
pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps;
|
||||
if (scratchSpaceSize && pScratchSpace == nullptr)
|
||||
{
|
||||
pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES);
|
||||
}
|
||||
|
||||
const API_STATE& state = GetApiState(pDC);
|
||||
|
||||
SWR_CS_CONTEXT csContext{0};
|
||||
csContext.tileCounter = threadGroupId;
|
||||
csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
|
||||
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
|
||||
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
|
||||
csContext.pTGSM = pContext->ppScratch[workerId];
|
||||
csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
|
||||
csContext.pScratchSpace = (uint8_t*)pScratchSpace;
|
||||
csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp;
|
||||
|
||||
state.pfnCsFunc(GetPrivateState(pDC),
|
||||
pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
|
||||
&csContext);
|
||||
|
||||
UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
|
||||
AR_EVENT(CSStats((HANDLE)&csContext.stats));
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Process shutdown.
|
||||
/// @param pDC - pointer to draw context (dispatch).
|
||||
/// @param workerId - The unique worker ID that is assigned to this thread.
|
||||
/// @param threadGroupId - the linear index for the thread group within the dispatch.
|
||||
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
|
||||
{
|
||||
// Dummy function
|
||||
}
|
||||
|
||||
/// @brief Backend handler for sync work. It only decodes the macrotile id and
///        asserts that it refers to tile (0, 0).
/// @param pDC - pointer to draw context (unused).
/// @param workerId - The unique worker ID that is assigned to this thread (unused).
/// @param macroTile - macrotile identifier to decode.
/// @param pUserData - opaque user data pointer (unused).
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    uint32_t tileX;
    uint32_t tileY;
    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
    SWR_ASSERT(tileX == 0 && tileY == 0);
}
|
||||
|
||||
/// @brief Stores one attachment's hot tile for a macrotile, performing any
///        pending clear first, then moves the hot tile to the state requested
///        by the store descriptor.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile identifier.
/// @param pDesc - store descriptor (clear rect and post-store tile state).
/// @param attachment - render target attachment to store.
void ProcessStoreTileBE(DRAW_CONTEXT*               pDC,
                        uint32_t                    workerId,
                        uint32_t                    macroTile,
                        STORE_TILES_DESC*           pDesc,
                        SWR_RENDERTARGET_ATTACHMENT attachment)
{
    SWR_CONTEXT* pContext           = pDC->pContext;
    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);

    // Pick the hot tile format for the attachment. Color is the default and is
    // also what an unknown attachment falls back to after reporting it.
    SWR_FORMAT srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7:
        break;
    case SWR_ATTACHMENT_DEPTH:
        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_STENCIL:
        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
        break;
    default:
        SWR_INVALID("Unknown attachment: %d", attachment);
        break;
    }

    uint32_t tileX;
    uint32_t tileY;
    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);

    // Only need to store the hottile if it's been rendered to...
    HOTTILE* pHotTile =
        pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    if (pHotTile)
    {
        // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
        if (pHotTile->state == HOTTILE_CLEAR)
        {
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          attachment,
                          macroTile,
                          pHotTile->renderTargetArrayIndex,
                          pHotTile->clearData,
                          pDesc->rect);
        }

        // The tile state is deliberately re-read after each callback rather
        // than cached in a local.
        const bool postStateIsDirty =
            (pDesc->postStoreTileState == static_cast<SWR_TILE_STATE>(HOTTILE_DIRTY));

        if (pHotTile->state == HOTTILE_DIRTY || postStateIsDirty)
        {
            const int32_t destX = KNOB_MACROTILE_X_DIM * tileX;
            const int32_t destY = KNOB_MACROTILE_Y_DIM * tileY;

            pContext->pfnStoreTile(pDC,
                                   hWorkerPrivateData,
                                   srcFormat,
                                   attachment,
                                   destX,
                                   destY,
                                   pHotTile->renderTargetArrayIndex,
                                   pHotTile->pBuffer);
        }

        // A dirty tile always takes the post-store state; a resolved tile takes
        // it unless the requested post-store state is dirty.
        if (pHotTile->state == HOTTILE_DIRTY ||
            (pHotTile->state == HOTTILE_RESOLVED && !postStateIsDirty))
        {
            pHotTile->state = static_cast<HOTTILE_STATE>(pDesc->postStoreTileState);
        }
    }
    RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
}
|
||||
|
||||
/// @brief Stores every attachment selected in the descriptor's attachment
///        mask for one macrotile, lowest set bit first.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile identifier.
/// @param pData - pointer to a STORE_TILES_DESC.
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
    STORE_TILES_DESC* pDesc = static_cast<STORE_TILES_DESC*>(pData);

    // Work on a copy of the mask, clearing each attachment bit once handled.
    uint32_t      remaining     = pDesc->attachmentMask;
    unsigned long attachmentBit = 0;
    while (_BitScanForward(&attachmentBit, remaining))
    {
        remaining &= ~(1 << attachmentBit);
        ProcessStoreTileBE(
            pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)attachmentBit);
    }
}
|
||||
|
||||
/// @brief Sets the hot tile state of every attachment selected in the
///        descriptor's attachment mask to the descriptor's new tile state.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread (unused).
/// @param macroTile - macrotile identifier.
/// @param pData - pointer to a DISCARD_INVALIDATE_TILES_DESC.
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
                                     uint32_t      workerId,
                                     uint32_t      macroTile,
                                     void*         pData)
{
    DISCARD_INVALIDATE_TILES_DESC* pDesc =
        static_cast<DISCARD_INVALIDATE_TILES_DESC*>(pData);
    SWR_CONTEXT* pContext = pDC->pContext;

    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);

    for (uint32_t attachment = 0; attachment < SWR_NUM_ATTACHMENTS; ++attachment)
    {
        if ((pDesc->attachmentMask & (1 << attachment)) == 0)
        {
            continue; // attachment not selected
        }

        HOTTILE* pHotTile =
            pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
                                                    pDC,
                                                    macroTile,
                                                    (SWR_RENDERTARGET_ATTACHMENT)attachment,
                                                    pDesc->createNewTiles,
                                                    numSamples);
        if (pHotTile == nullptr)
        {
            continue;
        }

        // A DIRTY or CLEAR tile may be switched to HOTTILE_INVALID here.
        // This is OK for APIs that explicitly allow discards
        // (for e.g. depth / stencil data), so it is not reported:
        // SWR_INVALID("Discarding valid data!");
        pHotTile->state = static_cast<HOTTILE_STATE>(pDesc->newTileState);
    }
}
|
||||
|
||||
template <uint32_t sampleCountT>
|
||||
void BackendNullPS(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
SWR_TRIANGLE_DESC& work,
|
||||
RenderOutputBuffers& renderBuffers)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
|
||||
///@todo: handle center multisample pattern
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
|
||||
|
||||
const API_STATE& state = GetApiState(pDC);
|
||||
|
||||
BarycentricCoeffs coeffs;
|
||||
SetupBarycentricCoeffs(&coeffs, work);
|
||||
|
||||
uint8_t *pDepthBuffer, *pStencilBuffer;
|
||||
SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
|
||||
|
||||
SWR_PS_CONTEXT psContext;
|
||||
// skip SetupPixelShaderContext(&psContext, ...); // not needed here
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
|
||||
|
||||
simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||
|
||||
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
|
||||
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||
|
||||
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||
|
||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||
{
|
||||
// iterate over active samples
|
||||
unsigned long sample = 0;
|
||||
uint32_t sampleMask = state.blendState.sampleMask;
|
||||
while (_BitScanForward(&sample, sampleMask))
|
||||
{
|
||||
sampleMask &= ~(1 << sample);
|
||||
|
||||
simdmask coverageMask = work.coverageMask[sample] & MASK;
|
||||
|
||||
if (coverageMask)
|
||||
{
|
||||
// offset depth/stencil buffers current sample
|
||||
uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
|
||||
uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
|
||||
|
||||
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||
{
|
||||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
|
||||
"Unsupported depth hot tile format");
|
||||
|
||||
const simdscalar z =
|
||||
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
|
||||
|
||||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||
|
||||
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
||||
}
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
|
||||
|
||||
// calculate per sample positions
|
||||
psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
|
||||
psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
|
||||
|
||||
CalcSampleBarycentrics(coeffs, psContext);
|
||||
|
||||
// interpolate and quantize z
|
||||
psContext.vZ = vplaneps(coeffs.vZa,
|
||||
coeffs.vZb,
|
||||
coeffs.vZc,
|
||||
psContext.vI.sample,
|
||||
psContext.vJ.sample);
|
||||
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
|
||||
|
||||
// interpolate user clip distance if available
|
||||
if (state.backendState.clipDistanceMask)
|
||||
{
|
||||
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
|
||||
work.pUserClipBuffer,
|
||||
psContext.vI.sample,
|
||||
psContext.vJ.sample);
|
||||
}
|
||||
|
||||
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
|
||||
simdscalar stencilPassMask = vCoverageMask;
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
|
||||
simdscalar depthPassMask = DepthStencilTest(&state,
|
||||
work.triFlags.frontFacing,
|
||||
work.triFlags.viewportIndex,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
&stencilPassMask);
|
||||
AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
|
||||
_simd_movemask_ps(stencilPassMask),
|
||||
_simd_movemask_ps(vCoverageMask)));
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
stencilPassMask);
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
|
||||
|
||||
uint32_t statMask = _simd_movemask_ps(depthPassMask);
|
||||
uint32_t statCount = _mm_popcnt_u32(statMask);
|
||||
UPDATE_STAT_BE(DepthPassCount, statCount);
|
||||
}
|
||||
|
||||
Endtile:
|
||||
ATTR_UNUSED;
|
||||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
}
|
||||
|
||||
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
pStencilBuffer +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
|
||||
vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
|
||||
}
|
||||
|
||||
vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
|
||||
}
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
|
||||
}
|
||||
|
||||
PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
|
||||
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
|
||||
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
|
||||
[2] // canEarlyZ
|
||||
= {};
|
||||
PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
|
||||
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
|
||||
[2] // forcedSampleCount
|
||||
[2] // canEarlyZ
|
||||
= {};
|
||||
PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
|
||||
[2] // centroid
|
||||
[2] // canEarlyZ
|
||||
= {};
|
||||
|
||||
void InitBackendFuncTables()
|
||||
{
|
||||
InitBackendPixelRate();
|
||||
InitBackendSingleFuncTable(gBackendSingleSample);
|
||||
InitBackendSampleFuncTable(gBackendSampleRateTable);
|
||||
|
||||
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>;
|
||||
gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>;
|
||||
gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>;
|
||||
gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>;
|
||||
gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
|
||||
}
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file backend.h
|
||||
*
|
||||
* @brief Backend handles rasterization, pixel shading and output merger
|
||||
* operations.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
#include "core/context.h"
|
||||
#include "core/multisample.h"
|
||||
#include "depthstencil.h"
|
||||
#include "rdtsc_core.h"
|
||||
|
||||
void ProcessComputeBE(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t threadGroupId,
|
||||
void*& pSpillFillBuffer,
|
||||
void*& pScratchSpace);
|
||||
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
|
||||
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
|
||||
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
|
||||
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t macroTile,
|
||||
void* pData);
|
||||
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
|
||||
|
||||
typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*,
|
||||
HANDLE hWorkerData,
|
||||
SWR_RENDERTARGET_ATTACHMENT rt,
|
||||
uint32_t,
|
||||
uint32_t,
|
||||
uint32_t[4],
|
||||
const SWR_RECT& rect);
|
||||
|
||||
extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
|
||||
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
|
||||
extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
|
||||
[2]; // canEarlyZ
|
||||
extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
|
||||
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
|
||||
[2] // forcedSampleCount
|
||||
[2] // canEarlyZ
|
||||
;
|
||||
extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
|
||||
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
|
||||
[2]; // canEarlyZ
|
||||
|
|
@ -1,308 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file backend.cpp
|
||||
*
|
||||
* @brief Backend handles rasterization, pixel shading and output merger
|
||||
* operations.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "backend_impl.h"
|
||||
#include "tilemgr.h"
|
||||
#include "memory/tilingtraits.h"
|
||||
#include "core/multisample.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
template <SWR_FORMAT format>
|
||||
void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
|
||||
{
|
||||
auto lambda = [&](int32_t comp)
|
||||
{
|
||||
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
|
||||
|
||||
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
|
||||
};
|
||||
|
||||
const uint32_t numIter =
|
||||
(KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
|
||||
|
||||
for (uint32_t i = 0; i < numIter; ++i)
|
||||
{
|
||||
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
|
||||
}
|
||||
}
|
||||
|
||||
template <SWR_FORMAT format>
|
||||
INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
|
||||
HANDLE hWorkerPrivateData,
|
||||
SWR_RENDERTARGET_ATTACHMENT rt,
|
||||
uint32_t macroTile,
|
||||
uint32_t renderTargetArrayIndex,
|
||||
uint32_t clear[4],
|
||||
const SWR_RECT& rect)
|
||||
{
|
||||
// convert clear color to hottile format
|
||||
// clear color is in RGBA float/uint32
|
||||
|
||||
simd16vector vClear;
|
||||
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
|
||||
{
|
||||
simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]);
|
||||
|
||||
if (FormatTraits<format>::isNormalized(comp))
|
||||
{
|
||||
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
|
||||
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
|
||||
}
|
||||
vComp = FormatTraits<format>::pack(comp, vComp);
|
||||
|
||||
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
|
||||
}
|
||||
|
||||
uint32_t tileX, tileY;
|
||||
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
|
||||
|
||||
// Init to full macrotile
|
||||
SWR_RECT clearTile = {
|
||||
KNOB_MACROTILE_X_DIM * int32_t(tileX),
|
||||
KNOB_MACROTILE_Y_DIM * int32_t(tileY),
|
||||
KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
|
||||
KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
|
||||
};
|
||||
|
||||
// intersect with clear rect
|
||||
clearTile &= rect;
|
||||
|
||||
// translate to local hottile origin
|
||||
clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM,
|
||||
-int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
|
||||
|
||||
// Make maximums inclusive (needed for convert to raster tiles)
|
||||
clearTile.xmax -= 1;
|
||||
clearTile.ymax -= 1;
|
||||
|
||||
// convert to raster tiles
|
||||
clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
|
||||
clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
|
||||
clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
|
||||
clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
|
||||
|
||||
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
|
||||
// compute steps between raster tile samples / raster tiles / macro tile rows
|
||||
const uint32_t rasterTileSampleStep =
|
||||
KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
|
||||
const uint32_t rasterTileStep =
|
||||
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
|
||||
const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
|
||||
const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
|
||||
|
||||
HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext,
|
||||
pDC,
|
||||
hWorkerPrivateData,
|
||||
macroTile,
|
||||
rt,
|
||||
true,
|
||||
numSamples,
|
||||
renderTargetArrayIndex);
|
||||
uint32_t rasterTileStartOffset =
|
||||
(ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp>>(
|
||||
pitch, clearTile.xmin, clearTile.ymin)) *
|
||||
numSamples;
|
||||
uint8_t* pRasterTileRow =
|
||||
pHotTile->pBuffer +
|
||||
rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ,
|
||||
// FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
|
||||
|
||||
// loop over all raster tiles in the current hot tile
|
||||
for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
|
||||
{
|
||||
uint8_t* pRasterTile = pRasterTileRow;
|
||||
for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
|
||||
{
|
||||
for (int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
|
||||
{
|
||||
ClearRasterTile<format>(pRasterTile, vClear);
|
||||
pRasterTile += rasterTileSampleStep;
|
||||
}
|
||||
}
|
||||
pRasterTileRow += macroTileRowStep;
|
||||
}
|
||||
|
||||
pHotTile->state = HOTTILE_DIRTY;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Back-end handler for a clear work item on one macrotile.
///
/// With KNOB_FAST_CLEAR set, the affected hot tiles are only tagged with the
/// clear value and put in the HOTTILE_CLEAR state. Otherwise (legacy clear)
/// the per-format routine from gClearTilesTable is called for each attachment.
///
/// @param pDC - draw context.
/// @param workerId - id of the worker thread processing this item.
/// @param macroTile - macrotile id to clear.
/// @param pUserData - pointer to the CLEAR_DESC describing the clear.
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    SWR_CONTEXT* pContext           = pDC->pContext;
    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
    CLEAR_DESC*  pClear             = static_cast<CLEAR_DESC*>(pUserData);

    if (KNOB_FAST_CLEAR)
    {
        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
        uint32_t              numSamples  = GetNumSamples(sampleCount);

        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.

        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;

            // Walk the set bits of the color attachment mask, lowest first.
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                HOTTILE* pHotTile =
                    pContext->pHotTileMgr->GetHotTile(pContext,
                                                      pDC,
                                                      hWorkerPrivateData,
                                                      macroTile,
                                                      (SWR_RENDERTARGET_ATTACHMENT)rt,
                                                      true,
                                                      numSamples,
                                                      pClear->renderTargetArrayIndex);

                // All we want to do here is to mark the hot tile as being in a
                // "needs clear" state; the color is stored as raw 32-bit values.
                pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
                pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
                pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
                pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
                pHotTile->state        = HOTTILE_CLEAR;
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_DEPTH,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);

            pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_STENCIL,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);

            pHotTile->clearData[0] = pClear->clearStencil;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
    }
    else
    {
        // Legacy clear
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            uint32_t clearData[4];
            clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
            clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
            clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
            clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;

            // Clear every color attachment selected by the mask, lowest first.
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                pfnClearTiles(pDC,
                              hWorkerPrivateData,
                              (SWR_RENDERTARGET_ATTACHMENT)rt,
                              macroTile,
                              pClear->renderTargetArrayIndex,
                              clearData,
                              pClear->rect);
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            // Zero-initialized so the unused entries handed to the clear
            // routine are never indeterminate.
            uint32_t clearData[4] = {};
            clearData[0]          = *(uint32_t*)&pClear->clearDepth;

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_DEPTH,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            // Zero-initialized for the same reason as the depth path.
            uint32_t clearData[4] = {};
            clearData[0]          = pClear->clearStencil;

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
            // Same guard as the color and depth paths: the table entry is null
            // when no clear routine is registered for the format.
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_STENCIL,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
    }
}
|
||||
|
||||
void InitClearTilesTable()
|
||||
{
|
||||
memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
|
||||
|
||||
gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
|
||||
gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
|
||||
gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
|
||||
gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
|
||||
gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,454 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file backend.cpp
|
||||
*
|
||||
* @brief Backend handles rasterization, pixel shading and output merger
|
||||
* operations.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "backend_impl.h"
|
||||
#include "tilemgr.h"
|
||||
#include "memory/tilingtraits.h"
|
||||
#include "core/multisample.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
template <typename T>
|
||||
void BackendSampleRate(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
SWR_TRIANGLE_DESC& work,
|
||||
RenderOutputBuffers& renderBuffers)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId);
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
|
||||
|
||||
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
|
||||
const API_STATE& state = GetApiState(pDC);
|
||||
|
||||
BarycentricCoeffs coeffs;
|
||||
SetupBarycentricCoeffs(&coeffs, work);
|
||||
|
||||
SWR_PS_CONTEXT psContext;
|
||||
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
|
||||
SetupPixelShaderContext<T>(&psContext, samplePos, work);
|
||||
|
||||
uint8_t *pDepthBuffer, *pStencilBuffer;
|
||||
SetupRenderBuffers(psContext.pColorBuffer,
|
||||
&pDepthBuffer,
|
||||
&pStencilBuffer,
|
||||
state.colorHottileEnable,
|
||||
renderBuffers);
|
||||
|
||||
bool isTileDirty = false;
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
|
||||
|
||||
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||
|
||||
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||
|
||||
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||
|
||||
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||
|
||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||
{
|
||||
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||
|
||||
|
||||
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||
{
|
||||
const uint64_t* pCoverageMask =
|
||||
(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
||||
? &work.innerCoverageMask
|
||||
: &work.coverageMask[0];
|
||||
|
||||
generateInputCoverage<T, T::InputCoverage>(
|
||||
pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
|
||||
}
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
|
||||
|
||||
CalcPixelBarycentrics(coeffs, psContext);
|
||||
|
||||
CalcCentroid<T, false>(
|
||||
&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
|
||||
|
||||
for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
|
||||
{
|
||||
simdmask coverageMask = work.coverageMask[sample] & MASK;
|
||||
|
||||
if (coverageMask)
|
||||
{
|
||||
// offset depth/stencil buffers current sample
|
||||
uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
|
||||
uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
|
||||
|
||||
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||
{
|
||||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
|
||||
"Unsupported depth hot tile format");
|
||||
|
||||
const simdscalar z =
|
||||
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
|
||||
|
||||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||
|
||||
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
||||
}
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
|
||||
|
||||
// calculate per sample positions
|
||||
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
|
||||
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
|
||||
|
||||
CalcSampleBarycentrics(coeffs, psContext);
|
||||
|
||||
// interpolate and quantize z
|
||||
psContext.vZ = vplaneps(coeffs.vZa,
|
||||
coeffs.vZb,
|
||||
coeffs.vZc,
|
||||
psContext.vI.sample,
|
||||
psContext.vJ.sample);
|
||||
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
|
||||
|
||||
// interpolate user clip distance if available
|
||||
if (state.backendState.clipDistanceMask)
|
||||
{
|
||||
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
|
||||
work.pUserClipBuffer,
|
||||
psContext.vI.sample,
|
||||
psContext.vJ.sample);
|
||||
}
|
||||
|
||||
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
|
||||
simdscalar depthPassMask = vCoverageMask;
|
||||
simdscalar stencilPassMask = vCoverageMask;
|
||||
|
||||
// Early-Z?
|
||||
if (T::bCanEarlyZ)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
|
||||
depthPassMask = DepthStencilTest(&state,
|
||||
work.triFlags.frontFacing,
|
||||
work.triFlags.viewportIndex,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
&stencilPassMask);
|
||||
AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
|
||||
_simd_movemask_ps(stencilPassMask),
|
||||
_simd_movemask_ps(vCoverageMask)));
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
|
||||
|
||||
// early-exit if no samples passed depth or earlyZ is forced on.
|
||||
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
|
||||
{
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
stencilPassMask);
|
||||
|
||||
if (!_simd_movemask_ps(depthPassMask))
|
||||
{
|
||||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
psContext.sampleIndex = sample;
|
||||
psContext.activeMask = _simd_castps_si(vCoverageMask);
|
||||
|
||||
// execute pixel shader
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
|
||||
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
|
||||
|
||||
// update stats
|
||||
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
|
||||
AR_EVENT(PSStats((HANDLE)&psContext.stats));
|
||||
|
||||
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
|
||||
|
||||
if (_simd_movemask_ps(vCoverageMask))
|
||||
{
|
||||
isTileDirty = true;
|
||||
}
|
||||
|
||||
// late-Z
|
||||
if (!T::bCanEarlyZ)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
|
||||
depthPassMask = DepthStencilTest(&state,
|
||||
work.triFlags.frontFacing,
|
||||
work.triFlags.viewportIndex,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
&stencilPassMask);
|
||||
AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
|
||||
_simd_movemask_ps(stencilPassMask),
|
||||
_simd_movemask_ps(vCoverageMask)));
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
|
||||
|
||||
if (!_simd_movemask_ps(depthPassMask))
|
||||
{
|
||||
// need to call depth/stencil write for stencil write
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
stencilPassMask);
|
||||
|
||||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t statMask = _simd_movemask_ps(depthPassMask);
|
||||
uint32_t statCount = _mm_popcnt_u32(statMask);
|
||||
UPDATE_STAT_BE(DepthPassCount, statCount);
|
||||
|
||||
// output merger
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
|
||||
|
||||
OutputMerger8x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
sample,
|
||||
&state.blendState,
|
||||
state.pfnBlendFunc,
|
||||
vCoverageMask,
|
||||
depthPassMask,
|
||||
state.psState.renderTargetMask,
|
||||
useAlternateOffset,
|
||||
workerId);
|
||||
|
||||
// do final depth write after all pixel kills
|
||||
if (!state.psState.forceEarlyZ)
|
||||
{
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthSample,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilSample,
|
||||
stencilPassMask);
|
||||
}
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
|
||||
}
|
||||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
}
|
||||
|
||||
Endtile:
|
||||
ATTR_UNUSED;
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
|
||||
|
||||
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
||||
{
|
||||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
}
|
||||
|
||||
if (useAlternateOffset)
|
||||
{
|
||||
unsigned long rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
rtMask &= ~(1 << rt);
|
||||
psContext.pColorBuffer[rt] +=
|
||||
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
}
|
||||
|
||||
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
pStencilBuffer +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
|
||||
|
||||
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
|
||||
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
|
||||
}
|
||||
|
||||
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
|
||||
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
|
||||
}
|
||||
|
||||
if (isTileDirty)
|
||||
{
|
||||
SetRenderHotTilesDirty(pDC, renderBuffers);
|
||||
}
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Turns a list of runtime arguments into template arguments.
///        Each GetFunc overload peels off its first argument, appends the
///        matching compile-time constant to ArgsT and recurses on the rest;
///        the SWR_BACKEND_FUNCS overload ends the recursion and returns the
///        BackendSampleRate instantiation for the accumulated arguments.
template <uint32_t... ArgsT>
struct BEChooserSampleRate
{
    // Terminator: the last argument selects the backend kind. Only the
    // sample-rate backend is valid here; anything else asserts and yields
    // a null function pointer.
    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
    {
        if (tArg == SWR_BACKEND_MSAA_SAMPLE_RATE)
        {
            return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
        }

        // SWR_BACKEND_SINGLE_SAMPLE, SWR_BACKEND_MSAA_PIXEL_RATE and any
        // unknown value all land here.
        SWR_ASSERT(0 && "Invalid backend func\n");
        return nullptr;
    }

    // Input coverage mode -> template argument.
    template <typename... TArgsT>
    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
    {
        switch (tArg)
        {
        case SWR_INPUT_COVERAGE_NONE:
            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
                remainingArgs...);
        case SWR_INPUT_COVERAGE_NORMAL:
            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
                remainingArgs...);
        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
                remainingArgs...);
        default:
            // Unknown mode: assert, then fall back to no input coverage.
            SWR_ASSERT(0 && "Invalid sample pattern\n");
            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
                remainingArgs...);
        }
    }

    // Sample count -> template argument.
    template <typename... TArgsT>
    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
    {
        switch (tArg)
        {
        case SWR_MULTISAMPLE_1X:
            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
        case SWR_MULTISAMPLE_2X:
            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
        case SWR_MULTISAMPLE_4X:
            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
        case SWR_MULTISAMPLE_8X:
            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
        case SWR_MULTISAMPLE_16X:
            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
        default:
            // Unknown count: assert, then fall back to 1X.
            SWR_ASSERT(0 && "Invalid sample count\n");
            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
        }
    }

    // Boolean flag -> template argument (1 for true, 0 for false).
    template <typename... TArgsT>
    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
    {
        return tArg ? BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...)
                    : BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
    }
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Fills the sample-rate backend table with one BackendSampleRate
///        instantiation per combination of sample count, input coverage,
///        centroid and early-Z.
/// @param table - table to fill, indexed
///                [sampleCount][inputCoverage][centroid][canEarlyZ].
void InitBackendSampleFuncTable(
    PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
{
    for (uint32_t msaa = SWR_MULTISAMPLE_1X; msaa < SWR_MULTISAMPLE_TYPE_COUNT; ++msaa)
    {
        const SWR_MULTISAMPLE_COUNT sampleCount = static_cast<SWR_MULTISAMPLE_COUNT>(msaa);

        for (uint32_t coverage = 0; coverage < SWR_INPUT_COVERAGE_COUNT; ++coverage)
        {
            const SWR_INPUT_COVERAGE inputCoverage = static_cast<SWR_INPUT_COVERAGE>(coverage);

            for (uint32_t centroid = 0; centroid < 2; ++centroid)
            {
                const bool useCentroid = (centroid > 0);

                for (uint32_t earlyZ = 0; earlyZ < 2; ++earlyZ)
                {
                    const bool canEarlyZ = (earlyZ > 0);

                    // The two literal `false` arguments are always off for
                    // sample rate. NOTE(review): by position they look like
                    // isCenterPattern and forcedSampleCount (compare the
                    // pixel-rate table dimensions) - confirm against
                    // SwrBackendTraits.
                    table[msaa][coverage][centroid][earlyZ] = BEChooserSampleRate<>::GetFunc(
                        sampleCount,
                        false,
                        inputCoverage,
                        useCentroid,
                        false,
                        canEarlyZ,
                        (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
                }
            }
        }
    }
}
|
||||
|
|
@ -1,428 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file backend.cpp
|
||||
*
|
||||
* @brief Backend handles rasterization, pixel shading and output merger
|
||||
* operations.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "backend_impl.h"
|
||||
#include "tilemgr.h"
|
||||
#include "memory/tilingtraits.h"
|
||||
#include "core/multisample.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Single-sample backend: walks one tile in SIMD-tile sized steps and,
///        for every step that still has coverage, runs depth-bounds testing,
///        barycentric/Z setup, user clipping, depth/stencil testing, the pixel
///        shader, the output merger and the depth/stencil write.
/// @tparam T - backend traits type. This function reads T::InputCoverage and
///        T::bCanEarlyZ and forwards T to SetupPixelShaderContext,
///        generateInputCoverage and CalcCentroid.
/// @param pDC - draw context; supplies API state, bucket manager and thread data.
/// @param workerId - indexes threadPool.pThreadData and is forwarded to OutputMerger8x2.
/// @param x - first X coordinate; the inner loop covers [x, x + KNOB_TILE_X_DIM).
/// @param y - first Y coordinate; the outer loop covers [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor. coverageMask[0] (and innerCoverageMask for
///        inner-conservative input coverage) is shifted in place as each SIMD
///        tile is consumed.
/// @param renderBuffers - passed to SetupRenderBuffers, and to
///        SetRenderHotTilesDirty when the shader left any lane active.
template <typename T>
|
||||
void BackendSingleSample(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t x,
|
||||
uint32_t y,
|
||||
SWR_TRIANGLE_DESC& work,
|
||||
RenderOutputBuffers& renderBuffers)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId);
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
|
||||
|
||||
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
|
||||
|
||||
const API_STATE& state = GetApiState(pDC);
|
||||
|
||||
BarycentricCoeffs coeffs;
|
||||
SetupBarycentricCoeffs(&coeffs, work);
|
||||
|
||||
SWR_PS_CONTEXT psContext;
|
||||
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
|
||||
SetupPixelShaderContext<T>(&psContext, samplePos, work);
|
||||
|
||||
uint8_t *pDepthBuffer, *pStencilBuffer;
|
||||
SetupRenderBuffers(psContext.pColorBuffer,
|
||||
&pDepthBuffer,
|
||||
&pStencilBuffer,
|
||||
state.colorHottileEnable,
|
||||
renderBuffers);
|
||||
|
||||
// Indicates backend rendered something to the color buffer
|
||||
bool isTileDirty = false;
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1);
|
||||
|
||||
// Seed the per-lane Y coordinates for the first SIMD row; they are stepped
// by dy at the end of every outer-loop iteration.
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
|
||||
|
||||
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
|
||||
|
||||
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
|
||||
{
|
||||
// X coordinates restart at x for every row and are stepped by dx per SIMD tile.
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
|
||||
|
||||
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
|
||||
|
||||
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
|
||||
{
|
||||
// True when the SIMD_TILE_X_DIM bit of xx is set; forwarded to the output
// merger and used below to decide when the color pointers advance.
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
|
||||
|
||||
|
||||
// Coverage bits for the current SIMD tile (MASK is defined outside this function).
simdmask coverageMask = work.coverageMask[0] & MASK;
|
||||
|
||||
if (coverageMask)
|
||||
{
|
||||
// Optional depth-bounds test against the depth values loaded from
// pDepthBuffer; the resulting accept mask narrows coverageMask.
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
|
||||
{
|
||||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
|
||||
"Unsupported depth hot tile format");
|
||||
|
||||
const simdscalar z =
|
||||
_simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
|
||||
|
||||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||
|
||||
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
|
||||
}
|
||||
|
||||
// Build psContext.inputMask from the inner-conservative mask or the
// regular coverage mask, depending on the traits.
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||
{
|
||||
const uint64_t* pCoverageMask =
|
||||
(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
||||
? &work.innerCoverageMask
|
||||
: &work.coverageMask[0];
|
||||
|
||||
generateInputCoverage<T, T::InputCoverage>(
|
||||
pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
|
||||
}
|
||||
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
|
||||
|
||||
CalcPixelBarycentrics(coeffs, psContext);
|
||||
|
||||
CalcCentroid<T, true>(
|
||||
&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
|
||||
|
||||
// interpolate and quantize z
|
||||
psContext.vZ = vplaneps(
|
||||
coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
|
||||
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1);
|
||||
|
||||
// interpolate user clip distance if available
|
||||
if (state.backendState.clipDistanceMask)
|
||||
{
|
||||
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
|
||||
work.pUserClipBuffer,
|
||||
psContext.vI.center,
|
||||
psContext.vJ.center);
|
||||
}
|
||||
|
||||
// Both pass masks start out equal to the coverage mask; the depth/stencil
// test calls below overwrite them.
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
|
||||
simdscalar depthPassMask = vCoverageMask;
|
||||
simdscalar stencilPassMask = vCoverageMask;
|
||||
|
||||
// Early-Z?
|
||||
if (T::bCanEarlyZ)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
|
||||
depthPassMask = DepthStencilTest(&state,
|
||||
work.triFlags.frontFacing,
|
||||
work.triFlags.viewportIndex,
|
||||
psContext.vZ,
|
||||
pDepthBuffer,
|
||||
vCoverageMask,
|
||||
pStencilBuffer,
|
||||
&stencilPassMask);
|
||||
AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
|
||||
_simd_movemask_ps(stencilPassMask),
|
||||
_simd_movemask_ps(vCoverageMask)));
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
|
||||
|
||||
// early-exit if no pixels passed depth or earlyZ is forced on
|
||||
// With forceEarlyZ the depth/stencil write happens here, before the
// shader runs; the write after the output merger is skipped in that case.
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
|
||||
{
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthBuffer,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilBuffer,
|
||||
stencilPassMask);
|
||||
|
||||
if (!_simd_movemask_ps(depthPassMask))
|
||||
{
|
||||
goto Endtile;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
psContext.sampleIndex = 0;
|
||||
psContext.activeMask = _simd_castps_si(vCoverageMask);
|
||||
|
||||
// execute pixel shader
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
|
||||
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
|
||||
|
||||
// update stats
|
||||
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
|
||||
AR_EVENT(PSStats((HANDLE)&psContext.stats));
|
||||
|
||||
// Read back the active mask after the shader call; it replaces the
// coverage mask for the rest of this SIMD tile.
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
|
||||
|
||||
if (_simd_movemask_ps(vCoverageMask))
|
||||
{
|
||||
isTileDirty = true;
|
||||
}
|
||||
|
||||
// late-Z
|
||||
if (!T::bCanEarlyZ)
|
||||
{
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
|
||||
depthPassMask = DepthStencilTest(&state,
|
||||
work.triFlags.frontFacing,
|
||||
work.triFlags.viewportIndex,
|
||||
psContext.vZ,
|
||||
pDepthBuffer,
|
||||
vCoverageMask,
|
||||
pStencilBuffer,
|
||||
&stencilPassMask);
|
||||
AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
|
||||
_simd_movemask_ps(stencilPassMask),
|
||||
_simd_movemask_ps(vCoverageMask)));
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
|
||||
|
||||
if (!_simd_movemask_ps(depthPassMask))
|
||||
{
|
||||
// need to call depth/stencil write for stencil write
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthBuffer,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilBuffer,
|
||||
stencilPassMask);
|
||||
goto Endtile;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// for early z, consolidate discards from shader
|
||||
// into depthPassMask
|
||||
depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
|
||||
}
|
||||
|
||||
uint32_t statMask = _simd_movemask_ps(depthPassMask);
|
||||
uint32_t statCount = _mm_popcnt_u32(statMask);
|
||||
UPDATE_STAT_BE(DepthPassCount, statCount);
|
||||
|
||||
// output merger
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
|
||||
|
||||
OutputMerger8x2(pDC,
|
||||
psContext,
|
||||
psContext.pColorBuffer,
|
||||
0,
|
||||
&state.blendState,
|
||||
state.pfnBlendFunc,
|
||||
vCoverageMask,
|
||||
depthPassMask,
|
||||
state.psState.renderTargetMask,
|
||||
useAlternateOffset,
|
||||
workerId);
|
||||
|
||||
// do final depth write after all pixel kills
|
||||
if (!state.psState.forceEarlyZ)
|
||||
{
|
||||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
|
||||
&state.depthStencilState,
|
||||
work.triFlags.frontFacing,
|
||||
psContext.vZ,
|
||||
pDepthBuffer,
|
||||
depthPassMask,
|
||||
vCoverageMask,
|
||||
pStencilBuffer,
|
||||
stencilPassMask);
|
||||
}
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
|
||||
}
|
||||
|
||||
// Per-tile epilogue: reached by fall-through and by both goto statements
// above, so the state below advances whether or not anything was shaded.
Endtile:
|
||||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
|
||||
|
||||
// Shift out the coverage bits that belonged to this SIMD tile.
work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
|
||||
{
|
||||
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
|
||||
}
|
||||
|
||||
// Color pointers of the enabled render targets advance only on iterations
// where useAlternateOffset is set, by two SIMD widths of pixels at a time.
if (useAlternateOffset)
|
||||
{
|
||||
unsigned long rt;
|
||||
uint32_t rtMask = state.colorHottileEnable;
|
||||
while (_BitScanForward(&rt, rtMask))
|
||||
{
|
||||
rtMask &= ~(1 << rt);
|
||||
psContext.pColorBuffer[rt] +=
|
||||
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
}
|
||||
}
|
||||
|
||||
// Depth and stencil pointers advance by one SIMD width on every iteration.
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
pStencilBuffer +=
|
||||
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
|
||||
|
||||
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
|
||||
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
|
||||
}
|
||||
|
||||
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
|
||||
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
|
||||
}
|
||||
|
||||
if (isTileDirty)
|
||||
{
|
||||
SetRenderHotTilesDirty(pDC, renderBuffers);
|
||||
}
|
||||
|
||||
RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0);
|
||||
}
|
||||
|
||||
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
|
||||
// arguments to static template arguments.
|
||||
template <uint32_t... ArgsT>
|
||||
struct BEChooserSingleSample
|
||||
{
|
||||
// Last Arg Terminator
|
||||
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
|
||||
{
|
||||
switch (tArg)
|
||||
{
|
||||
case SWR_BACKEND_SINGLE_SAMPLE:
|
||||
return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
|
||||
break;
|
||||
case SWR_BACKEND_MSAA_PIXEL_RATE:
|
||||
case SWR_BACKEND_MSAA_SAMPLE_RATE:
|
||||
default:
|
||||
SWR_ASSERT(0 && "Invalid backend func\n");
|
||||
return nullptr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Recursively parse args
|
||||
template <typename... TArgsT>
|
||||
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
|
||||
{
|
||||
switch (tArg)
|
||||
{
|
||||
case SWR_INPUT_COVERAGE_NONE:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
|
||||
remainingArgs...);
|
||||
break;
|
||||
case SWR_INPUT_COVERAGE_NORMAL:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
|
||||
remainingArgs...);
|
||||
break;
|
||||
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
|
||||
remainingArgs...);
|
||||
break;
|
||||
default:
|
||||
SWR_ASSERT(0 && "Invalid sample pattern\n");
|
||||
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
|
||||
remainingArgs...);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Recursively parse args
|
||||
template <typename... TArgsT>
|
||||
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
|
||||
{
|
||||
switch (tArg)
|
||||
{
|
||||
case SWR_MULTISAMPLE_1X:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
|
||||
break;
|
||||
case SWR_MULTISAMPLE_2X:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
|
||||
break;
|
||||
case SWR_MULTISAMPLE_4X:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
|
||||
break;
|
||||
case SWR_MULTISAMPLE_8X:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
|
||||
break;
|
||||
case SWR_MULTISAMPLE_16X:
|
||||
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
|
||||
break;
|
||||
default:
|
||||
SWR_ASSERT(0 && "Invalid sample count\n");
|
||||
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Recursively parse args
|
||||
template <typename... TArgsT>
|
||||
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
|
||||
{
|
||||
if (tArg == true)
|
||||
{
|
||||
return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
|
||||
}
|
||||
|
||||
return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
|
||||
}
|
||||
};
|
||||
|
||||
/// Populates the single-sample backend lookup table.
///
/// Every combination of input coverage mode, centroid flag and early-Z flag
/// is resolved through BEChooserSingleSample (always with SWR_MULTISAMPLE_1X
/// and SWR_BACKEND_SINGLE_SAMPLE) and stored at
/// table[inputCoverage][isCentroid][canEarlyZ]. The two remaining bool
/// arguments handed to the chooser are always false.
///
/// @param table - destination table, filled completely.
void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
{
    for (uint32_t coverageIdx = 0; coverageIdx < SWR_INPUT_COVERAGE_COUNT; ++coverageIdx)
    {
        const SWR_INPUT_COVERAGE inputCoverage = static_cast<SWR_INPUT_COVERAGE>(coverageIdx);

        for (uint32_t centroidIdx = 0; centroidIdx < 2; ++centroidIdx)
        {
            const bool isCentroid = (centroidIdx != 0);

            for (uint32_t earlyZIdx = 0; earlyZIdx < 2; ++earlyZIdx)
            {
                const bool canEarlyZ = (earlyZIdx != 0);

                table[coverageIdx][centroidIdx][earlyZIdx] =
                    BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
                                                     false,
                                                     inputCoverage,
                                                     isCentroid,
                                                     false,
                                                     canEarlyZ,
                                                     SWR_BACKEND_SINGLE_SAMPLE);
            }
        }
    }
}
|
||||
|
|
@ -1,57 +0,0 @@
|
|||
# Copyright © 2017-2018 Intel Corporation
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
|
||||
# Generated pixel-rate backend sources: runs swr_gen_backends_py with
# prog_python and appends the resulting target to files_swr_common.
files_swr_common += custom_target(
|
||||
'gen_backend_pixel',
|
||||
input : swr_gen_backends_py,
|
||||
# Four numbered .cpp files plus one shared header.
output : [
|
||||
'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp',
|
||||
'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp',
|
||||
'gen_BackendPixelRate.hpp',
|
||||
],
|
||||
command : [
|
||||
prog_python, '@INPUT@',
|
||||
'--outdir', '@OUTDIR@',
|
||||
# NOTE(review): the meaning of each --dim value is defined by the generator
# script and is not visible here.
'--dim', '5', '2', '3', '2', '2', '2',
|
||||
# Presumably has to match the number of numbered .cpp outputs listed above.
'--numfiles', '4',
|
||||
'--cpp', '--hpp',
|
||||
],
|
||||
# Extra files (besides the input script) whose changes re-run the generator.
depend_files : [ swr_gen_backend_files, swr_gen_header_init_files ],
|
||||
)
|
||||
|
||||
# Generated rasterizer sources: runs the same swr_gen_backends_py script, this
# time with '--rast', and appends the resulting target to files_swr_common.
files_swr_common += custom_target(
|
||||
'gen_backend_raster',
|
||||
input : swr_gen_backends_py,
|
||||
# Four numbered .cpp files plus one shared header.
output : [
|
||||
'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp',
|
||||
'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp',
|
||||
'gen_rasterizer.hpp',
|
||||
],
|
||||
command : [
|
||||
prog_python, '@INPUT@',
|
||||
'--outdir', '@OUTDIR@',
|
||||
'--rast',
|
||||
# NOTE(review): the meaning of each --dim value is defined by the generator
# script and is not visible here.
'--dim', '5', '2', '2', '3', '5', '2',
|
||||
# Presumably has to match the number of numbered .cpp outputs listed above.
'--numfiles', '4',
|
||||
'--cpp', '--hpp',
|
||||
],
|
||||
# Extra files (besides the input script) whose changes re-run the generator.
depend_files : [ swr_gen_rasterizer_files, swr_gen_header_init_files ],
|
||||
)
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,254 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file binner.h
|
||||
*
|
||||
* @brief Declaration for the macrotile binner
|
||||
*
|
||||
******************************************************************************/
|
||||
#include "state.h"
|
||||
#include "conservativeRast.h"
|
||||
#include "utils.h"
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Offsets added to post-viewport vertex positions based on
|
||||
/// raster state.
|
||||
///
|
||||
/// Can't use a variable template because we must stick with C++11 features.
|
||||
/// Variable templates were introduced in C++14.
|
||||
template <typename SIMD_T>
|
||||
struct SwrPixelOffsets
|
||||
{
|
||||
public:
|
||||
INLINE static Float<SIMD_T> GetOffset(uint32_t loc)
|
||||
{
|
||||
SWR_ASSERT(loc <= 1);
|
||||
|
||||
return SIMD_T::set1_ps(loc ? 0.5f : 0.0f);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Convert the X,Y coords of a triangle to the requested Fixed
|
||||
/// Point precision from FP32.
|
||||
template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>>
|
||||
INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn)
|
||||
{
|
||||
return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value)));
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Helper function to set the X,Y coords of a triangle to the
|
||||
/// requested Fixed Point precision from FP32.
|
||||
/// @param tri: simdvector[3] of FP triangle verts
|
||||
/// @param vXi: fixed point X coords of tri verts
|
||||
/// @param vYi: fixed point Y coords of tri verts
|
||||
template <typename SIMD_T>
|
||||
INLINE static void
|
||||
FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3])
|
||||
{
|
||||
vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x);
|
||||
vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y);
|
||||
vXi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].x);
|
||||
vYi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].y);
|
||||
vXi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].x);
|
||||
vYi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].y);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Calculate bounding box for current triangle
|
||||
/// @tparam CT: ConservativeRastFETraits type
|
||||
/// @param vX: fixed point X position for triangle verts
|
||||
/// @param vY: fixed point Y position for triangle verts
|
||||
/// @param bbox: fixed point bbox
|
||||
/// *Note*: expects vX, vY to be in the correct precision for the type
|
||||
/// of rasterization. This avoids unnecessary FP->fixed conversions.
|
||||
template <typename SIMD_T, typename CT>
|
||||
INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3],
|
||||
const Integer<SIMD_T> (&vY)[3],
|
||||
SIMDBBOX_T<SIMD_T>& bbox)
|
||||
{
|
||||
Integer<SIMD_T> vMinX = vX[0];
|
||||
|
||||
vMinX = SIMD_T::min_epi32(vMinX, vX[1]);
|
||||
vMinX = SIMD_T::min_epi32(vMinX, vX[2]);
|
||||
|
||||
Integer<SIMD_T> vMaxX = vX[0];
|
||||
|
||||
vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]);
|
||||
vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]);
|
||||
|
||||
Integer<SIMD_T> vMinY = vY[0];
|
||||
|
||||
vMinY = SIMD_T::min_epi32(vMinY, vY[1]);
|
||||
vMinY = SIMD_T::min_epi32(vMinY, vY[2]);
|
||||
|
||||
Integer<SIMD_T> vMaxY = vY[0];
|
||||
|
||||
vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]);
|
||||
vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]);
|
||||
|
||||
if (CT::BoundingBoxOffsetT::value != 0)
|
||||
{
|
||||
/// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative
|
||||
/// rasterization expand bbox by 1/256; coverage will be correctly handled in the
|
||||
/// rasterizer.
|
||||
|
||||
const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
|
||||
|
||||
vMinX = SIMD_T::sub_epi32(vMinX, value);
|
||||
vMaxX = SIMD_T::add_epi32(vMaxX, value);
|
||||
vMinY = SIMD_T::sub_epi32(vMinY, value);
|
||||
vMaxY = SIMD_T::add_epi32(vMaxY, value);
|
||||
}
|
||||
|
||||
bbox.xmin = vMinX;
|
||||
bbox.xmax = vMaxX;
|
||||
bbox.ymin = vMinY;
|
||||
bbox.ymax = vMaxY;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Gather scissor rect data based on per-prim viewport indices.
|
||||
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
|
||||
/// @param pViewportIndex - array of per-primitive viewport indexes.
|
||||
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
|
||||
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
|
||||
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
|
||||
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
|
||||
//
|
||||
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
|
||||
static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
|
||||
const uint32_t* pViewportIndex,
|
||||
simdscalari& scisXmin,
|
||||
simdscalari& scisYmin,
|
||||
simdscalari& scisXmax,
|
||||
simdscalari& scisYmax)
|
||||
{
|
||||
scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].xmin);
|
||||
scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].ymin);
|
||||
scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].xmax);
|
||||
scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].ymax);
|
||||
}
|
||||
|
||||
static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
|
||||
const uint32_t* pViewportIndex,
|
||||
simd16scalari& scisXmin,
|
||||
simd16scalari& scisYmin,
|
||||
simd16scalari& scisXmax,
|
||||
simd16scalari& scisYmax)
|
||||
{
|
||||
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[14]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[13]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[12]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[11]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[10]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[9]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[8]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[7]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].xmin);
|
||||
|
||||
scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[14]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[13]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[12]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[11]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[10]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[9]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[8]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[7]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].ymin);
|
||||
|
||||
scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[14]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[13]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[12]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[11]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[10]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[9]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[8]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[7]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].xmax);
|
||||
|
||||
scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[14]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[13]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[12]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[11]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[10]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[9]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[8]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[7]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
|
||||
pScissorsInFixedPoint[pViewportIndex[0]].ymax);
|
||||
}
|
||||
|
|
@ -1,348 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file blend.cpp
|
||||
*
|
||||
* @brief Implementation for blending operations.
|
||||
*
|
||||
******************************************************************************/
|
||||
#include "state.h"
|
||||
|
||||
template <bool Color, bool Alpha>
|
||||
INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func,
|
||||
simdvector& constantColor,
|
||||
simdvector& src,
|
||||
simdvector& src1,
|
||||
simdvector& dst,
|
||||
simdvector& out)
|
||||
{
|
||||
simdvector result;
|
||||
|
||||
switch (func)
|
||||
{
|
||||
case BLENDFACTOR_ZERO:
|
||||
result.x = _simd_setzero_ps();
|
||||
result.y = _simd_setzero_ps();
|
||||
result.z = _simd_setzero_ps();
|
||||
result.w = _simd_setzero_ps();
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_ONE:
|
||||
result.x = _simd_set1_ps(1.0);
|
||||
result.y = _simd_set1_ps(1.0);
|
||||
result.z = _simd_set1_ps(1.0);
|
||||
result.w = _simd_set1_ps(1.0);
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_SRC_COLOR:
|
||||
result = src;
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_DST_COLOR:
|
||||
result = dst;
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_SRC_COLOR:
|
||||
result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
|
||||
result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
|
||||
result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
|
||||
result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_DST_COLOR:
|
||||
result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
|
||||
result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
|
||||
result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
|
||||
result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_SRC_ALPHA:
|
||||
result.x = src.w;
|
||||
result.y = src.w;
|
||||
result.z = src.w;
|
||||
result.w = src.w;
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_SRC_ALPHA:
|
||||
{
|
||||
simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
|
||||
result.x = oneMinusSrcA;
|
||||
result.y = oneMinusSrcA;
|
||||
result.z = oneMinusSrcA;
|
||||
result.w = oneMinusSrcA;
|
||||
break;
|
||||
}
|
||||
|
||||
case BLENDFACTOR_DST_ALPHA:
|
||||
result.x = dst.w;
|
||||
result.y = dst.w;
|
||||
result.z = dst.w;
|
||||
result.w = dst.w;
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_DST_ALPHA:
|
||||
{
|
||||
simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
|
||||
result.x = oneMinusDstA;
|
||||
result.y = oneMinusDstA;
|
||||
result.z = oneMinusDstA;
|
||||
result.w = oneMinusDstA;
|
||||
break;
|
||||
}
|
||||
|
||||
case BLENDFACTOR_SRC_ALPHA_SATURATE:
|
||||
{
|
||||
simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
|
||||
result.x = sat;
|
||||
result.y = sat;
|
||||
result.z = sat;
|
||||
result.w = _simd_set1_ps(1.0);
|
||||
break;
|
||||
}
|
||||
|
||||
case BLENDFACTOR_CONST_COLOR:
|
||||
result.x = constantColor[0];
|
||||
result.y = constantColor[1];
|
||||
result.z = constantColor[2];
|
||||
result.w = constantColor[3];
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_CONST_ALPHA:
|
||||
result.x = result.y = result.z = result.w = constantColor[3];
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_CONST_COLOR:
|
||||
{
|
||||
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]);
|
||||
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]);
|
||||
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]);
|
||||
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
|
||||
break;
|
||||
}
|
||||
|
||||
case BLENDFACTOR_INV_CONST_ALPHA:
|
||||
{
|
||||
result.x = result.y = result.z = result.w =
|
||||
_simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
|
||||
break;
|
||||
}
|
||||
|
||||
case BLENDFACTOR_SRC1_COLOR:
|
||||
result.x = src1.x;
|
||||
result.y = src1.y;
|
||||
result.z = src1.z;
|
||||
result.w = src1.w;
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_SRC1_ALPHA:
|
||||
result.x = result.y = result.z = result.w = src1.w;
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_SRC1_COLOR:
|
||||
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x);
|
||||
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y);
|
||||
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z);
|
||||
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
|
||||
break;
|
||||
|
||||
case BLENDFACTOR_INV_SRC1_ALPHA:
|
||||
result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
|
||||
break;
|
||||
|
||||
default:
|
||||
SWR_INVALID("Unimplemented blend factor: %d", func);
|
||||
}
|
||||
|
||||
if (Color)
|
||||
{
|
||||
out.x = result.x;
|
||||
out.y = result.y;
|
||||
out.z = result.z;
|
||||
}
|
||||
if (Alpha)
|
||||
{
|
||||
out.w = result.w;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool Color, bool Alpha>
|
||||
INLINE void BlendFunc(SWR_BLEND_OP blendOp,
|
||||
simdvector& src,
|
||||
simdvector& srcFactor,
|
||||
simdvector& dst,
|
||||
simdvector& dstFactor,
|
||||
simdvector& out)
|
||||
{
|
||||
simdvector result;
|
||||
|
||||
switch (blendOp)
|
||||
{
|
||||
case BLENDOP_ADD:
|
||||
result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
|
||||
result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
|
||||
result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
|
||||
result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
|
||||
break;
|
||||
|
||||
case BLENDOP_SUBTRACT:
|
||||
result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
|
||||
result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
|
||||
result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
|
||||
result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
|
||||
break;
|
||||
|
||||
case BLENDOP_REVSUBTRACT:
|
||||
result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
|
||||
result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
|
||||
result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
|
||||
result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
|
||||
break;
|
||||
|
||||
case BLENDOP_MIN:
|
||||
result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
|
||||
result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
|
||||
result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
|
||||
result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
|
||||
break;
|
||||
|
||||
case BLENDOP_MAX:
|
||||
result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
|
||||
result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
|
||||
result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
|
||||
result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
|
||||
break;
|
||||
|
||||
default:
|
||||
SWR_INVALID("Unimplemented blend function: %d", blendOp);
|
||||
}
|
||||
|
||||
if (Color)
|
||||
{
|
||||
out.x = result.x;
|
||||
out.y = result.y;
|
||||
out.z = result.z;
|
||||
}
|
||||
if (Alpha)
|
||||
{
|
||||
out.w = result.w;
|
||||
}
|
||||
}
|
||||
|
||||
template <SWR_TYPE type>
|
||||
INLINE void Clamp(simdvector& src)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case SWR_TYPE_FLOAT:
|
||||
break;
|
||||
|
||||
case SWR_TYPE_UNORM:
|
||||
src.x = _simd_max_ps(src.x, _simd_setzero_ps());
|
||||
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
|
||||
|
||||
src.y = _simd_max_ps(src.y, _simd_setzero_ps());
|
||||
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
|
||||
|
||||
src.z = _simd_max_ps(src.z, _simd_setzero_ps());
|
||||
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
|
||||
|
||||
src.w = _simd_max_ps(src.w, _simd_setzero_ps());
|
||||
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
|
||||
break;
|
||||
|
||||
case SWR_TYPE_SNORM:
|
||||
src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f));
|
||||
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
|
||||
|
||||
src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f));
|
||||
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
|
||||
|
||||
src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f));
|
||||
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
|
||||
|
||||
src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f));
|
||||
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
|
||||
break;
|
||||
|
||||
default:
|
||||
SWR_INVALID("Unimplemented clamp: %d", type);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template <SWR_TYPE type>
|
||||
void Blend(const SWR_BLEND_STATE* pBlendState,
|
||||
const SWR_RENDER_TARGET_BLEND_STATE* pState,
|
||||
simdvector& src,
|
||||
simdvector& src1,
|
||||
uint8_t* pDst,
|
||||
simdvector& result)
|
||||
{
|
||||
// load render target
|
||||
simdvector dst;
|
||||
LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst);
|
||||
|
||||
simdvector constColor;
|
||||
constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
|
||||
constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
|
||||
constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
|
||||
constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
|
||||
|
||||
// clamp src/dst/constant
|
||||
Clamp<type>(src);
|
||||
Clamp<type>(src1);
|
||||
Clamp<type>(dst);
|
||||
Clamp<type>(constColor);
|
||||
|
||||
simdvector srcFactor, dstFactor;
|
||||
if (pBlendState->independentAlphaBlendEnable)
|
||||
{
|
||||
GenerateBlendFactor<true, false>(
|
||||
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
|
||||
GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor,
|
||||
constColor,
|
||||
src,
|
||||
src1,
|
||||
dst,
|
||||
srcFactor);
|
||||
|
||||
GenerateBlendFactor<true, false>(
|
||||
(SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
|
||||
GenerateBlendFactor<false, true>(
|
||||
(SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
|
||||
|
||||
BlendFunc<true, false>(
|
||||
(SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
|
||||
BlendFunc<false, true>(
|
||||
(SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
|
||||
}
|
||||
else
|
||||
{
|
||||
GenerateBlendFactor<true, true>(
|
||||
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
|
||||
GenerateBlendFactor<true, true>(
|
||||
(SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
|
||||
|
||||
BlendFunc<true, true>(
|
||||
(SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,336 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file clip.cpp
|
||||
*
|
||||
* @brief Implementation for clipping
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "common/os.h"
|
||||
#include "core/clip.h"
|
||||
|
||||
/// Returns boundaryCoord0 / (boundaryCoord0 - boundaryCoord1): the
/// interpolation factor used by intersect() to place the point where an edge
/// meets a clipping plane, given one boundary coordinate per edge vertex.
float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
    const float delta = boundaryCoord0 - boundaryCoord1;
    return boundaryCoord0 / delta;
}
|
||||
|
||||
template <SWR_CLIPCODES ClippingPlane>
|
||||
inline void intersect(
|
||||
int s, // index to first edge vertex v0 in pInPts.
|
||||
int p, // index to second edge vertex v1 in pInPts.
|
||||
const float* pInPts, // array of all the input positions.
|
||||
const float* pInAttribs, // array of all attributes for all vertex. All the attributes for each
|
||||
// vertex is contiguous.
|
||||
int numInAttribs, // number of attributes per vertex.
|
||||
int i, // output index.
|
||||
float* pOutPts, // array of output positions. We'll write our new intersection point at i*4.
|
||||
float* pOutAttribs) // array of output attributes. We'll write our new attributes at
|
||||
// i*numInAttribs.
|
||||
{
|
||||
float t;
|
||||
|
||||
// Find the parameter of the intersection.
|
||||
// t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
|
||||
const float* v1 = &pInPts[s * 4];
|
||||
const float* v2 = &pInPts[p * 4];
|
||||
|
||||
switch (ClippingPlane)
|
||||
{
|
||||
case FRUSTUM_LEFT:
|
||||
t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]);
|
||||
break;
|
||||
case FRUSTUM_RIGHT:
|
||||
t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]);
|
||||
break;
|
||||
case FRUSTUM_TOP:
|
||||
t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]);
|
||||
break;
|
||||
case FRUSTUM_BOTTOM:
|
||||
t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]);
|
||||
break;
|
||||
case FRUSTUM_NEAR:
|
||||
t = ComputeInterpFactor(v1[2], v2[2]);
|
||||
break;
|
||||
case FRUSTUM_FAR:
|
||||
t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]);
|
||||
break;
|
||||
default:
|
||||
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
|
||||
};
|
||||
|
||||
const float* a1 = &pInAttribs[s * numInAttribs];
|
||||
const float* a2 = &pInAttribs[p * numInAttribs];
|
||||
|
||||
float* pOutP = &pOutPts[i * 4];
|
||||
float* pOutA = &pOutAttribs[i * numInAttribs];
|
||||
|
||||
// Interpolate new position.
|
||||
for (int j = 0; j < 4; ++j)
|
||||
{
|
||||
pOutP[j] = v1[j] + (v2[j] - v1[j]) * t;
|
||||
}
|
||||
|
||||
// Interpolate Attributes
|
||||
for (int attr = 0; attr < numInAttribs; ++attr)
|
||||
{
|
||||
pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t;
|
||||
}
|
||||
}
|
||||
|
||||
// Checks whether vertex v lies inside clipping plane
|
||||
// in homogenous coords check -w < {x,y,z} < w;
|
||||
//
|
||||
template <SWR_CLIPCODES ClippingPlane>
|
||||
inline int inside(const float v[4])
|
||||
{
|
||||
switch (ClippingPlane)
|
||||
{
|
||||
case FRUSTUM_LEFT:
|
||||
return (v[0] >= -v[3]);
|
||||
case FRUSTUM_RIGHT:
|
||||
return (v[0] <= v[3]);
|
||||
case FRUSTUM_TOP:
|
||||
return (v[1] >= -v[3]);
|
||||
case FRUSTUM_BOTTOM:
|
||||
return (v[1] <= v[3]);
|
||||
case FRUSTUM_NEAR:
|
||||
return (v[2] >= 0.0f);
|
||||
case FRUSTUM_FAR:
|
||||
return (v[2] <= v[3]);
|
||||
default:
|
||||
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Clips a polygon in homogenous coordinates to a particular clipping plane.
|
||||
// Takes in vertices of the polygon (InPts) and the clipping plane
|
||||
// Puts the vertices of the clipped polygon in OutPts
|
||||
// Returns number of points in clipped polygon
|
||||
//
|
||||
template <SWR_CLIPCODES ClippingPlane>
|
||||
int ClipTriToPlane(const float* pInPts,
|
||||
int numInPts,
|
||||
const float* pInAttribs,
|
||||
int numInAttribs,
|
||||
float* pOutPts,
|
||||
float* pOutAttribs)
|
||||
{
|
||||
int i = 0; // index number of OutPts, # of vertices in OutPts = i div 4;
|
||||
|
||||
for (int j = 0; j < numInPts; ++j)
|
||||
{
|
||||
int s = j;
|
||||
int p = (j + 1) % numInPts;
|
||||
|
||||
int s_in = inside<ClippingPlane>(&pInPts[s * 4]);
|
||||
int p_in = inside<ClippingPlane>(&pInPts[p * 4]);
|
||||
|
||||
// test if vertex is to be added to output vertices
|
||||
if (s_in != p_in) // edge crosses clipping plane
|
||||
{
|
||||
// find point of intersection
|
||||
intersect<ClippingPlane>(
|
||||
s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
|
||||
i++;
|
||||
}
|
||||
if (p_in) // 2nd vertex is inside clipping volume, add it to output
|
||||
{
|
||||
// Copy 2nd vertex position of edge over to output.
|
||||
for (int k = 0; k < 4; ++k)
|
||||
{
|
||||
pOutPts[i * 4 + k] = pInPts[p * 4 + k];
|
||||
}
|
||||
// Copy 2nd vertex attributes of edge over to output.
|
||||
for (int attr = 0; attr < numInAttribs; ++attr)
|
||||
{
|
||||
pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr];
|
||||
}
|
||||
i++;
|
||||
}
|
||||
// edge does not cross clipping plane and vertex outside clipping volume
|
||||
// => do not add vertex
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/// Runs the SIMD256 clipper stage on a batch of rectangles, handled here as
/// 3-vertex primitives, inside the FEClipRectangles profiling bucket.
void ClipRectangles(DRAW_CONTEXT*      pDC,
                    PA_STATE&          pa,
                    uint32_t           workerId,
                    simdvector         prims[],
                    uint32_t           primMask,
                    simdscalari const& primId,
                    simdscalari const& viewportIdx,
                    simdscalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);

    enum
    {
        kVertsPerPrim = 3
    };

    Clipper<SIMD256, kVertsPerPrim> clipper(workerId, pDC);
    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
}
|
||||
|
||||
/// Runs the SIMD256 clipper stage on a batch of triangles (3 vertices per
/// primitive) inside the FEClipTriangles profiling bucket.
void ClipTriangles(DRAW_CONTEXT*      pDC,
                   PA_STATE&          pa,
                   uint32_t           workerId,
                   simdvector         prims[],
                   uint32_t           primMask,
                   simdscalari const& primId,
                   simdscalari const& viewportIdx,
                   simdscalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);

    enum
    {
        kVertsPerPrim = 3
    };

    Clipper<SIMD256, kVertsPerPrim> clipper(workerId, pDC);
    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
}
|
||||
|
||||
/// Runs the SIMD256 clipper stage on a batch of lines (2 vertices per
/// primitive) inside the FEClipLines profiling bucket.
void ClipLines(DRAW_CONTEXT*      pDC,
               PA_STATE&          pa,
               uint32_t           workerId,
               simdvector         prims[],
               uint32_t           primMask,
               simdscalari const& primId,
               simdscalari const& viewportIdx,
               simdscalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);

    enum
    {
        kVertsPerPrim = 2
    };

    Clipper<SIMD256, kVertsPerPrim> clipper(workerId, pDC);
    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
}
|
||||
|
||||
/// Runs the SIMD256 clipper stage on a batch of points (1 vertex per
/// primitive) inside the FEClipPoints profiling bucket.
void ClipPoints(DRAW_CONTEXT*      pDC,
                PA_STATE&          pa,
                uint32_t           workerId,
                simdvector         prims[],
                uint32_t           primMask,
                simdscalari const& primId,
                simdscalari const& viewportIdx,
                simdscalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);

    enum
    {
        kVertsPerPrim = 1
    };

    Clipper<SIMD256, kVertsPerPrim> clipper(workerId, pDC);
    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
}
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
/// SIMD512 variant of the rectangle clipper entry point: clears
/// pa.useAlternateOffset, then runs the clipper stage on 3-vertex primitives
/// inside the FEClipRectangles profiling bucket.
void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
                                    PA_STATE&            pa,
                                    uint32_t             workerId,
                                    simd16vector         prims[],
                                    uint32_t             primMask,
                                    simd16scalari const& primId,
                                    simd16scalari const& viewportIdx,
                                    simd16scalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);

    enum
    {
        kVertsPerPrim = 3
    };

    Clipper<SIMD512, kVertsPerPrim> stage(workerId, pDC);

    pa.useAlternateOffset = false;
    stage.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
}
|
||||
|
||||
/// SIMD512 variant of the triangle clipper entry point: clears
/// pa.useAlternateOffset, then runs the clipper stage on 3-vertex primitives
/// inside the FEClipTriangles profiling bucket.
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
                                   PA_STATE&            pa,
                                   uint32_t             workerId,
                                   simd16vector         prims[],
                                   uint32_t             primMask,
                                   simd16scalari const& primId,
                                   simd16scalari const& viewportIdx,
                                   simd16scalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);

    enum
    {
        kVertsPerPrim = 3
    };

    Clipper<SIMD512, kVertsPerPrim> stage(workerId, pDC);

    pa.useAlternateOffset = false;
    stage.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
}
|
||||
|
||||
/// SIMD512 variant of the line clipper entry point: clears
/// pa.useAlternateOffset, then runs the clipper stage on 2-vertex primitives
/// inside the FEClipLines profiling bucket.
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
                               PA_STATE&            pa,
                               uint32_t             workerId,
                               simd16vector         prims[],
                               uint32_t             primMask,
                               simd16scalari const& primId,
                               simd16scalari const& viewportIdx,
                               simd16scalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);

    enum
    {
        kVertsPerPrim = 2
    };

    Clipper<SIMD512, kVertsPerPrim> stage(workerId, pDC);

    pa.useAlternateOffset = false;
    stage.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
}
|
||||
|
||||
/// SIMD512 variant of the point clipper entry point: clears
/// pa.useAlternateOffset, then runs the clipper stage on 1-vertex primitives
/// inside the FEClipPoints profiling bucket.
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
                                PA_STATE&            pa,
                                uint32_t             workerId,
                                simd16vector         prims[],
                                uint32_t             primMask,
                                simd16scalari const& primId,
                                simd16scalari const& viewportIdx,
                                simd16scalari const& rtIdx)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);

    enum
    {
        kVertsPerPrim = 1
    };

    Clipper<SIMD512, kVertsPerPrim> stage(workerId, pDC);

    pa.useAlternateOffset = false;
    stage.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);

    RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,229 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file conservativerast.h
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
#include <type_traits>
|
||||
#include "common/simdintrin.h"
|
||||
|
||||
/// Identifiers for the fixed point precisions used by the rasterizer. Each
/// value is wrapped in a std::integral_constant by the Fixed_* typedefs.
// NOTE(review): the enumerator names begin with an underscore, which C++
// reserves for the implementation at global scope; they are kept because
// renaming them would change the public interface.
enum FixedPointFmt
{
    FP_UNINIT = 0,
    _16_8     = 1,
    _16_9     = 2,
    _X_16     = 3,
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief convenience typedefs for supported Fixed Point precisions
|
||||
typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit;
|
||||
typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8;
|
||||
typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9;
|
||||
typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @struct FixedPointTraits
/// @brief primary template for the constants used to convert between FP32
///        and fixed point. It is intentionally empty; the constants live in
///        the per-precision specializations.
/// @tparam FT: fixed precision type
template <typename FT>
struct FixedPointTraits
{
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Fixed_16_8 specialization of FixedPointTraits
|
||||
template <>
|
||||
struct FixedPointTraits<Fixed_16_8>
|
||||
{
|
||||
/// multiplier to go from FP32 to Fixed Point 16.8
|
||||
typedef std::integral_constant<uint32_t, 256> ScaleT;
|
||||
/// number of bits to shift to go from 16.8 fixed => int32
|
||||
typedef std::integral_constant<uint32_t, 8> BitsT;
|
||||
typedef Fixed_16_8 TypeT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Fixed_16_9 specialization of FixedPointTraits
|
||||
template <>
|
||||
struct FixedPointTraits<Fixed_16_9>
|
||||
{
|
||||
/// multiplier to go from FP32 to Fixed Point 16.9
|
||||
typedef std::integral_constant<uint32_t, 512> ScaleT;
|
||||
/// number of bits to shift to go from 16.9 fixed => int32
|
||||
typedef std::integral_constant<uint32_t, 9> BitsT;
|
||||
typedef Fixed_16_9 TypeT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Fixed_16_9 specialization of FixedPointTraits
|
||||
template <>
|
||||
struct FixedPointTraits<Fixed_X_16>
|
||||
{
|
||||
/// multiplier to go from FP32 to Fixed Point X.16
|
||||
typedef std::integral_constant<uint32_t, 65536> ScaleT;
|
||||
/// number of bits to shift to go from X.16 fixed => int32
|
||||
typedef std::integral_constant<uint32_t, 16> BitsT;
|
||||
typedef Fixed_X_16 TypeT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief tag types selecting the rasterization mode: std::false_type for
///        standard and std::true_type for conservative rasterization
using StandardRastT     = std::false_type;
using ConservativeRastT = std::true_type;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief convenience typedefs for Input Coverage rasterization modes
|
||||
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT;
|
||||
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT;
|
||||
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
|
||||
InnerConservativeCoverageT;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @struct ConservativeRastFETraits
/// @brief primary ConservativeRastFETraits template. Shouldn't be
///        instantiated; it is intentionally empty and the members live in
///        the StandardRastT / ConservativeRastT specializations.
/// @tparam ConservativeT: type of conservative rasterization
template <typename ConservativeT>
struct ConservativeRastFETraits
{
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief StandardRast specialization of ConservativeRastTraits
|
||||
template <>
|
||||
struct ConservativeRastFETraits<StandardRastT>
|
||||
{
|
||||
typedef std::false_type IsConservativeT;
|
||||
typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief ConservativeRastT specialization of ConservativeRastTraits
|
||||
template <>
|
||||
struct ConservativeRastFETraits<ConservativeRastT>
|
||||
{
|
||||
typedef std::true_type IsConservativeT;
|
||||
typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief convenience typedefs for ConservativeRastFETraits
|
||||
typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT;
|
||||
typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @struct ConservativeRastBETraits
|
||||
/// @brief primary ConservativeRastBETraits template. Shouldn't be instantiated;
|
||||
/// default to standard rasterization behavior
|
||||
/// @tparam ConservativeT: type of conservative rasterization
|
||||
/// @tparam InputCoverageT: type of input coverage requested, if any
|
||||
template <typename ConservativeT, typename _InputCoverageT>
|
||||
struct ConservativeRastBETraits
|
||||
{
|
||||
typedef std::false_type IsConservativeT;
|
||||
typedef _InputCoverageT InputCoverageT;
|
||||
typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
|
||||
typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
|
||||
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief StandardRastT specialization of ConservativeRastBETraits
|
||||
template <typename _InputCoverageT>
|
||||
struct ConservativeRastBETraits<StandardRastT, _InputCoverageT>
|
||||
{
|
||||
typedef std::false_type IsConservativeT;
|
||||
typedef _InputCoverageT InputCoverageT;
|
||||
typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
|
||||
typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
|
||||
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
|
||||
/// with no input coverage
|
||||
template <>
|
||||
struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>
|
||||
{
|
||||
typedef std::true_type IsConservativeT;
|
||||
typedef NoInputCoverageT InputCoverageT;
|
||||
|
||||
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
|
||||
|
||||
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
|
||||
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
|
||||
/// of of having to compare individual edges to pixel corners to check if any part of the
|
||||
/// triangle intersects a pixel
|
||||
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
|
||||
ConservativeEdgeOffsetT;
|
||||
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
|
||||
/// with OuterConservativeCoverage
|
||||
template <>
|
||||
struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT>
|
||||
{
|
||||
typedef std::true_type IsConservativeT;
|
||||
typedef OuterConservativeCoverageT InputCoverageT;
|
||||
|
||||
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
|
||||
|
||||
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
|
||||
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
|
||||
/// of of having to compare individual edges to pixel corners to check if any part of the
|
||||
/// triangle intersects a pixel
|
||||
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
|
||||
ConservativeEdgeOffsetT;
|
||||
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
|
||||
/// with InnerConservativeCoverage
|
||||
template <>
|
||||
struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT>
|
||||
{
|
||||
typedef std::true_type IsConservativeT;
|
||||
typedef InnerConservativeCoverageT InputCoverageT;
|
||||
|
||||
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
|
||||
|
||||
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
|
||||
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
|
||||
/// of of having to compare individual edges to pixel corners to check if any part of the
|
||||
/// triangle intersects a pixel
|
||||
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
|
||||
ConservativeEdgeOffsetT;
|
||||
|
||||
/// undo the outer conservative offset and offset edge towards from pixel center by 1/2 pixel +
|
||||
/// 1/512, in Fixed 16.9 precision this allows the rasterizer to do the 3 edge coverage tests
|
||||
/// against a single point, instead of of having to compare individual edges to pixel corners to
|
||||
/// check if a pixel is fully covered by a triangle
|
||||
typedef std::integral_constant<int32_t,
|
||||
static_cast<int32_t>(
|
||||
-((ConservativePrecisionT::ScaleT::value / 2) + 1) -
|
||||
ConservativeEdgeOffsetT::value)>
|
||||
InnerConservativeEdgeOffsetT;
|
||||
};
|
||||
|
|
@ -1,608 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file context.h
|
||||
*
|
||||
* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
|
||||
* The SWR_CONTEXT is our global context and contains the DC ring,
|
||||
* thread state, etc.
|
||||
*
|
||||
* The DRAW_CONTEXT contains all state associated with a draw operation.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <condition_variable>
|
||||
#include <algorithm>
|
||||
|
||||
#include "core/api.h"
|
||||
#include "core/utils.h"
|
||||
#include "core/arena.h"
|
||||
#include "core/fifo.hpp"
|
||||
#include "core/knobs.h"
|
||||
#include "common/intrin.h"
|
||||
#include "common/rdtsc_buckets.h"
|
||||
#include "core/threads.h"
|
||||
#include "ringbuffer.h"
|
||||
#include "archrast/archrast.h"
|
||||
|
||||
// x.8 fixed point precision values
|
||||
#define FIXED_POINT_SHIFT 8
|
||||
#define FIXED_POINT_SCALE 256
|
||||
|
||||
// x.16 fixed point precision values
|
||||
#define FIXED_POINT16_SHIFT 16
|
||||
#define FIXED_POINT16_SCALE 65536
|
||||
|
||||
struct SWR_CONTEXT;
|
||||
struct DRAW_CONTEXT;
|
||||
|
||||
struct TRI_FLAGS
|
||||
{
|
||||
uint32_t frontFacing : 1;
|
||||
uint32_t yMajor : 1;
|
||||
uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM);
|
||||
uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
|
||||
float pointSize;
|
||||
uint32_t renderTargetArrayIndex;
|
||||
uint32_t viewportIndex;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// SWR_TRIANGLE_DESC
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
struct SWR_TRIANGLE_DESC
|
||||
{
|
||||
float I[3];
|
||||
float J[3];
|
||||
float Z[3];
|
||||
float OneOverW[3];
|
||||
float recipDet;
|
||||
|
||||
float* pRecipW;
|
||||
float* pAttribs;
|
||||
float* pPerspAttribs;
|
||||
float* pSamplePos;
|
||||
float* pUserClipBuffer;
|
||||
|
||||
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
|
||||
uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
|
||||
// entire pixel is covered
|
||||
uint64_t anyCoveredSamples;
|
||||
|
||||
TRI_FLAGS triFlags;
|
||||
};
|
||||
|
||||
struct TRIANGLE_WORK_DESC
|
||||
{
|
||||
float* pTriBuffer;
|
||||
float* pAttribs;
|
||||
float* pUserClipBuffer;
|
||||
uint32_t numAttribs;
|
||||
TRI_FLAGS triFlags;
|
||||
};
|
||||
|
||||
struct CLEAR_DESC
|
||||
{
|
||||
SWR_RECT rect;
|
||||
uint32_t attachmentMask;
|
||||
uint32_t renderTargetArrayIndex;
|
||||
float clearRTColor[4]; // RGBA_32F
|
||||
float clearDepth; // [0..1]
|
||||
uint8_t clearStencil;
|
||||
};
|
||||
|
||||
struct DISCARD_INVALIDATE_TILES_DESC
|
||||
{
|
||||
uint32_t attachmentMask;
|
||||
SWR_RECT rect;
|
||||
SWR_TILE_STATE newTileState;
|
||||
bool createNewTiles;
|
||||
bool fullTilesOnly;
|
||||
};
|
||||
|
||||
struct SYNC_DESC
|
||||
{
|
||||
PFN_CALLBACK_FUNC pfnCallbackFunc;
|
||||
uint64_t userData;
|
||||
uint64_t userData2;
|
||||
uint64_t userData3;
|
||||
};
|
||||
|
||||
struct STORE_TILES_DESC
|
||||
{
|
||||
uint32_t attachmentMask;
|
||||
SWR_TILE_STATE postStoreTileState;
|
||||
SWR_RECT rect;
|
||||
};
|
||||
|
||||
/// Compute dispatch description: thread group counts in X/Y/Z plus a
/// thread-dispatch enable flag.
struct COMPUTE_DESC
{
    uint32_t threadGroupCountX;
    uint32_t threadGroupCountY;
    uint32_t threadGroupCountZ;
    bool     enableThreadDispatch;
};
|
||||
|
||||
typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
uint32_t macroTile,
|
||||
void* pDesc);
|
||||
|
||||
/// Kind of work carried by an FE_WORK / BE_WORK item.
/// Values are spelled out explicitly; they match the implicit numbering.
enum WORK_TYPE
{
    SYNC                   = 0,
    DRAW                   = 1,
    CLEAR                  = 2,
    DISCARDINVALIDATETILES = 3,
    STORETILES             = 4,
    SHUTDOWN               = 5,
};
|
||||
|
||||
/// Backend work item: a type tag, the worker callback, and a union holding
/// the descriptor for that type.
OSALIGNSIMD(struct) BE_WORK
{
    WORK_TYPE     type;
    PFN_WORK_FUNC pfnWork;

    // Presumably `type` selects the active member - verify against users.
    union
    {
        SYNC_DESC                     sync;
        TRIANGLE_WORK_DESC            tri;
        CLEAR_DESC                    clear;
        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
        STORE_TILES_DESC              storeTiles;
    } desc;
};
|
||||
|
||||
struct DRAW_WORK
|
||||
{
|
||||
DRAW_CONTEXT* pDC;
|
||||
union
|
||||
{
|
||||
uint32_t numIndices; // DrawIndexed: Number of indices for draw.
|
||||
uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
|
||||
};
|
||||
union
|
||||
{
|
||||
gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices
|
||||
uint32_t startVertex; // Draw: Starting vertex in VB to render from.
|
||||
};
|
||||
int32_t baseVertex;
|
||||
uint32_t numInstances; // Number of instances
|
||||
uint32_t startInstance; // Instance offset
|
||||
uint32_t startPrimID; // starting primitiveID for this draw batch
|
||||
uint32_t
|
||||
startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
|
||||
SWR_FORMAT type; // index buffer type
|
||||
};
|
||||
|
||||
typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext,
|
||||
DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
void* pDesc);
|
||||
struct FE_WORK
|
||||
{
|
||||
WORK_TYPE type;
|
||||
PFN_FE_WORK_FUNC pfnWork;
|
||||
union
|
||||
{
|
||||
SYNC_DESC sync;
|
||||
DRAW_WORK draw;
|
||||
CLEAR_DESC clear;
|
||||
DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
|
||||
STORE_TILES_DESC storeTiles;
|
||||
} desc;
|
||||
};
|
||||
|
||||
struct GUARDBANDS
|
||||
{
|
||||
float left[KNOB_NUM_VIEWPORTS_SCISSORS];
|
||||
float right[KNOB_NUM_VIEWPORTS_SCISSORS];
|
||||
float top[KNOB_NUM_VIEWPORTS_SCISSORS];
|
||||
float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
|
||||
};
|
||||
|
||||
struct PA_STATE;
|
||||
|
||||
// function signature for pipeline stages that execute after primitive assembly
|
||||
typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
|
||||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
simdvector prims[],
|
||||
uint32_t primMask,
|
||||
simdscalari const& primID,
|
||||
simdscalari const& viewportIdx,
|
||||
simdscalari const& rtIdx);
|
||||
|
||||
// function signature for pipeline stages that execute after primitive assembly
|
||||
typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
|
||||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
simd16vector prims[],
|
||||
uint32_t primMask,
|
||||
simd16scalari const& primID,
|
||||
simd16scalari const& viewportIdx,
|
||||
simd16scalari const& rtIdx);
|
||||
|
||||
/// Pipeline state for a draw, grouped below by pipeline stage.
OSALIGNLINE(struct) API_STATE
{
    // Vertex Buffers
    SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];

    // GS - Geometry Shader State
    SWR_GS_STATE gsState;
    PFN_GS_FUNC  pfnGsFunc;

    // FS - Fetch Shader State
    PFN_FETCH_FUNC pfnFetchFunc;

    // VS - Vertex Shader State
    PFN_VERTEX_FUNC pfnVertexFunc;

    // Index Buffer
    SWR_INDEX_BUFFER_STATE indexBuffer;

    // CS - Compute Shader
    PFN_CS_FUNC pfnCsFunc;
    uint32_t    totalThreadsInGroup;
    uint32_t    totalSpillFillSize;
    uint32_t    scratchSpaceSizePerWarp;
    uint32_t    scratchSpaceNumWarps;

    // FE - Frontend State
    SWR_FRONTEND_STATE frontendState;

    // SOS - Streamout Shader State
    PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];

    // Streamout state
    SWR_STREAMOUT_STATE          soState;
    mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
    mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS];

    // Tessellation State
    PFN_HS_FUNC  pfnHsFunc;
    PFN_DS_FUNC  pfnDsFunc;
    SWR_TS_STATE tsState;

    // Number of attributes used by the frontend (vs, so, gs)
    uint32_t feNumAttributes;

    // RS - Rasterizer State
    SWR_RASTSTATE rastState;
    // Floating point multisample offsets (two floats per sample)
    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];

    GUARDBANDS gbState;

    SWR_VIEWPORT          vp[KNOB_NUM_VIEWPORTS_SCISSORS];
    SWR_VIEWPORT_MATRICES vpMatrices;

    SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
    SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
    bool     scissorsTileAligned;

    bool               forceFront;
    PRIMITIVE_TOPOLOGY topology;

    // Backend state
    OSALIGNLINE(SWR_BACKEND_STATE) backendState;

    SWR_DEPTH_BOUNDS_STATE depthBoundsState;

    // PS - Pixel shader state
    SWR_PS_STATE psState;

    SWR_DEPTH_STENCIL_STATE depthStencilState;

    // OM - Output Merger State
    SWR_BLEND_STATE    blendState;
    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];

    // Stats and hottile enables, packed as bitfields in an anonymous struct.
    struct
    {
        uint32_t enableStatsFE : 1;        // Enable frontend pipeline stats
        uint32_t enableStatsBE : 1;        // Enable backend pipeline stats
        uint32_t colorHottileEnable : 8;   // Bitmask of enabled color hottiles
        uint32_t depthHottileEnable : 1;   // Enable depth buffer hottile
        uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
    };

    PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
};
|
||||
|
||||
class MacroTileMgr;
|
||||
class DispatchQueue;
|
||||
class HOTTILE;
|
||||
|
||||
struct RenderOutputBuffers
|
||||
{
|
||||
uint8_t* pColor[SWR_NUM_RENDERTARGETS];
|
||||
uint8_t* pDepth;
|
||||
uint8_t* pStencil;
|
||||
|
||||
HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
|
||||
HOTTILE* pDepthHotTile;
|
||||
HOTTILE* pStencilHotTile;
|
||||
};
|
||||
|
||||
// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
|
||||
struct BarycentricCoeffs
|
||||
{
|
||||
simdscalar vIa;
|
||||
simdscalar vIb;
|
||||
simdscalar vIc;
|
||||
|
||||
simdscalar vJa;
|
||||
simdscalar vJb;
|
||||
simdscalar vJc;
|
||||
|
||||
simdscalar vZa;
|
||||
simdscalar vZb;
|
||||
simdscalar vZc;
|
||||
|
||||
simdscalar vRecipDet;
|
||||
|
||||
simdscalar vAOneOverW;
|
||||
simdscalar vBOneOverW;
|
||||
simdscalar vCOneOverW;
|
||||
};
|
||||
|
||||
// pipeline function pointer types
|
||||
typedef void (*PFN_BACKEND_FUNC)(
|
||||
DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
|
||||
typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
|
||||
uint8_t* (&)[SWR_NUM_RENDERTARGETS],
|
||||
uint32_t,
|
||||
const SWR_BLEND_STATE*,
|
||||
const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
|
||||
simdscalar&,
|
||||
simdscalar const&);
|
||||
typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
|
||||
typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
|
||||
typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
|
||||
SWR_PS_CONTEXT&,
|
||||
const uint64_t* const,
|
||||
const uint32_t,
|
||||
simdscalar const&,
|
||||
simdscalar const&);
|
||||
|
||||
struct BACKEND_FUNCS
|
||||
{
|
||||
PFN_BACKEND_FUNC pfnBackend;
|
||||
};
|
||||
|
||||
// Draw State
|
||||
struct DRAW_STATE
|
||||
{
|
||||
API_STATE state;
|
||||
|
||||
void* pPrivateState; // Its required the driver sets this up for each draw.
|
||||
|
||||
// pipeline function pointers, filled in by API thread when setting up the draw
|
||||
BACKEND_FUNCS backendFuncs;
|
||||
PFN_PROCESS_PRIMS pfnProcessPrims;
|
||||
#if USE_SIMD16_FRONTEND
|
||||
PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
|
||||
#endif
|
||||
|
||||
CachingArena* pArena; // This should only be used by API thread.
|
||||
};
|
||||
|
||||
struct DRAW_DYNAMIC_STATE
|
||||
{
|
||||
void Reset(uint32_t numThreads)
|
||||
{
|
||||
SWR_STATS* pSavePtr = pStats;
|
||||
memset(this, 0, sizeof(*this));
|
||||
pStats = pSavePtr;
|
||||
memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
|
||||
}
|
||||
///@todo Currently assumes only a single FE can do stream output for a draw.
|
||||
uint32_t SoWriteOffset[4];
|
||||
bool SoWriteOffsetDirty[4];
|
||||
|
||||
SWR_STATS_FE statsFE; // Only one FE thread per DC.
|
||||
SWR_STATS* pStats;
|
||||
uint64_t soPrims; // number of primitives written to StreamOut buffer
|
||||
};
|
||||
|
||||
// Draw Context
|
||||
// The api thread sets up a draw context that exists for the life of the draw.
|
||||
// This draw context maintains all of the state needed for the draw operation.
|
||||
struct DRAW_CONTEXT
|
||||
{
|
||||
SWR_CONTEXT* pContext;
|
||||
union
|
||||
{
|
||||
MacroTileMgr* pTileMgr;
|
||||
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
|
||||
};
|
||||
DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
|
||||
CachingArena* pArena;
|
||||
|
||||
uint32_t drawId;
|
||||
bool dependentFE; // Frontend work is dependent on all previous FE
|
||||
bool dependent; // Backend work is dependent on all previous BE
|
||||
bool isCompute; // Is this DC a compute context?
|
||||
bool cleanupState; // True if this is the last draw using an entry in the state ring.
|
||||
|
||||
FE_WORK FeWork;
|
||||
|
||||
SYNC_DESC retireCallback; // Call this func when this DC is retired.
|
||||
|
||||
DRAW_DYNAMIC_STATE dynState;
|
||||
|
||||
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
|
||||
volatile OSALIGNLINE(uint32_t) FeLock;
|
||||
volatile OSALIGNLINE(uint32_t) threadsDone;
|
||||
};
|
||||
|
||||
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
|
||||
|
||||
/// Returns the API_STATE of the draw state referenced by pDC.
/// Both pDC and pDC->pState are asserted to be non-null.
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
    SWR_ASSERT(pDC != nullptr);
    SWR_ASSERT(pDC->pState != nullptr);

    const DRAW_STATE& drawState = *pDC->pState;
    return drawState.state;
}
|
||||
|
||||
/// Returns the pPrivateState pointer of the draw state referenced by pDC.
/// Both pDC and pDC->pState are asserted to be non-null.
INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
{
    SWR_ASSERT(pDC != nullptr);
    SWR_ASSERT(pDC->pState != nullptr);

    const DRAW_STATE& drawState = *pDC->pState;
    return drawState.pPrivateState;
}
|
||||
|
||||
class HotTileMgr;
|
||||
|
||||
struct SWR_CONTEXT
|
||||
{
|
||||
// Draw Context Ring
|
||||
// Each draw needs its own state in order to support multiple draws in flight across multiple
|
||||
// threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
|
||||
// maximum number of draws that can be in flight at any given time.
|
||||
//
|
||||
// Description:
|
||||
// 1. State - When an application first sets state we'll request a new draw context to use.
|
||||
// a. If there are no available draw contexts then we'll have to wait until one becomes
|
||||
// free. b. If one is available then set pCurDrawContext to point to it and mark it in use.
|
||||
// c. All state calls set state on pCurDrawContext.
|
||||
// 2. Draw - Creates submits a work item that is associated with current draw context.
|
||||
// a. Set pPrevDrawContext = pCurDrawContext
|
||||
// b. Set pCurDrawContext to NULL.
|
||||
// 3. State - When an applications sets state after draw
|
||||
// a. Same as step 1.
|
||||
// b. State is copied from prev draw context to current.
|
||||
RingBuffer<DRAW_CONTEXT> dcRing;
|
||||
|
||||
DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
|
||||
DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
|
||||
// that we can copy state from.
|
||||
|
||||
MacroTileMgr* pMacroTileManagerArray;
|
||||
DispatchQueue* pDispatchQueueArray;
|
||||
|
||||
// Draw State Ring
|
||||
// When draw are very large (lots of primitives) then the API thread will break these up.
|
||||
// These split draws all have identical state. So instead of storing the state directly
|
||||
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
|
||||
// to reference a single entry in the DS ring.
|
||||
RingBuffer<DRAW_STATE> dsRing;
|
||||
|
||||
uint32_t curStateId; // Current index to the next available entry in the DS ring.
|
||||
|
||||
uint32_t NumWorkerThreads;
|
||||
uint32_t NumFEThreads;
|
||||
uint32_t NumBEThreads;
|
||||
|
||||
THREAD_POOL threadPool; // Thread pool associated with this context
|
||||
SWR_THREADING_INFO threadInfo;
|
||||
SWR_API_THREADING_INFO apiThreadInfo;
|
||||
SWR_WORKER_PRIVATE_STATE workerPrivateState;
|
||||
|
||||
uint32_t MAX_DRAWS_IN_FLIGHT;
|
||||
|
||||
std::condition_variable FifosNotEmpty;
|
||||
std::mutex WaitLock;
|
||||
|
||||
uint32_t privateStateSize;
|
||||
|
||||
HotTileMgr* pHotTileMgr;
|
||||
|
||||
// Callback functions, passed in at create context time
|
||||
PFN_LOAD_TILE pfnLoadTile;
|
||||
PFN_STORE_TILE pfnStoreTile;
|
||||
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
|
||||
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
|
||||
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
|
||||
PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
|
||||
PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
|
||||
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
|
||||
PFN_UPDATE_STATS pfnUpdateStats;
|
||||
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
|
||||
PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
|
||||
|
||||
|
||||
// Global Stats
|
||||
SWR_STATS* pStats;
|
||||
|
||||
// Scratch space for workers.
|
||||
uint8_t** ppScratch;
|
||||
|
||||
volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
|
||||
|
||||
OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
|
||||
uint32_t frameCount;
|
||||
|
||||
uint32_t lastFrameChecked;
|
||||
uint64_t lastDrawChecked;
|
||||
TileSet* pSingleThreadLockedTiles;
|
||||
|
||||
// ArchRast thread contexts.
|
||||
HANDLE* pArContext;
|
||||
|
||||
// handle to external memory for worker data to create memory contexts
|
||||
HANDLE hExternalMemory;
|
||||
|
||||
BucketManager *pBucketMgr;
|
||||
};
|
||||
|
||||
// Stat update helpers. Both expect `pDC` in the enclosing scope, and
// UPDATE_STAT_BE additionally expects `workerId`.
//
// The bodies are wrapped in do { } while (0) so that each macro expands to a
// single statement: `if (x) UPDATE_STAT_BE(a, b); else ...` then parses as
// written instead of binding the `else` to the macro's internal `if`.
// `count` is parenthesized so any expression can be passed safely.
#define UPDATE_STAT_BE(name, count)                           \
    do                                                        \
    {                                                         \
        if (GetApiState(pDC).enableStatsBE)                   \
        {                                                     \
            pDC->dynState.pStats[workerId].name += (count);   \
        }                                                     \
    } while (0)
#define UPDATE_STAT_FE(name, count)                           \
    do                                                        \
    {                                                         \
        if (GetApiState(pDC).enableStatsFE)                   \
        {                                                     \
            pDC->dynState.statsFE.name += (count);            \
        }                                                     \
    } while (0)
|
||||
|
||||
// ArchRast instrumentation framework
// AR_WORKER_CTX expects `pDC` and `workerId` in scope; AR_API_CTX expects both
// `pDC` and `pContext` in scope.
#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]

// RDTSC bucket timing: expands to nothing unless KNOB_ENABLE_RDTSC is defined.
#if defined(KNOB_ENABLE_RDTSC)
#define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
#define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
#else
#define RDTSC_BEGIN(pBucketMgr, type, drawid)
#define RDTSC_END(pBucketMgr, type, count)
#endif

// ArchRast event dispatch: expands to nothing unless KNOB_ENABLE_AR is defined.
#if defined(KNOB_ENABLE_AR)
#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
#else
#define _AR_EVENT(ctx, event)
#define _AR_FLUSH(ctx, id)
#endif

// Use these macros for api thread.
#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)

// Use these macros for worker threads.
#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)
|
||||
|
|
@ -1,335 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file depthstencil.h
|
||||
*
|
||||
* @brief Implements depth/stencil functionality
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
#include "common/os.h"
|
||||
#include "format_conversion.h"
|
||||
|
||||
/// Applies a stencil operation to the lanes selected by `mask`.
///
/// @param op           Stencil operation to apply.
/// @param mask         Lane select passed to _simd_blendv_ps; selected lanes
///                     take the new value, the others keep `stencilps`.
/// @param stencilRefps Reference value used by STENCILOP_REPLACE.
/// @param stencilps    In/out stencil values, held as integer bits in a
///                     float SIMD register.
INLINE
void StencilOp(SWR_STENCILOP     op,
               simdscalar const& mask,
               simdscalar const& stencilRefps,
               simdscalar&       stencilps)
{
    // Value each selected lane will take; computed per op, blended once below.
    simdscalar vUpdated;

    switch (op)
    {
    case STENCILOP_ZERO:
        vUpdated = _simd_setzero_ps();
        break;

    case STENCILOP_REPLACE:
        vUpdated = stencilRefps;
        break;

    case STENCILOP_INCRSAT:
        // Saturating unsigned byte add of 1 to the low byte of each 32-bit lane.
        vUpdated = _simd_castsi_ps(
            _simd_adds_epu8(_simd_castps_si(stencilps), _simd_set1_epi32(1)));
        break;

    case STENCILOP_DECRSAT:
        // Saturating unsigned byte subtract of 1 from the low byte of each lane.
        vUpdated = _simd_castsi_ps(
            _simd_subs_epu8(_simd_castps_si(stencilps), _simd_set1_epi32(1)));
        break;

    case STENCILOP_INCR:
        // Wrapping byte add of 1.
        vUpdated = _simd_castsi_ps(
            _simd_add_epi8(_simd_castps_si(stencilps), _simd_set1_epi32(1)));
        break;

    case STENCILOP_DECR:
        // Wrapping byte decrement, expressed as adding 0xff to the low byte.
        vUpdated = _simd_castsi_ps(
            _simd_add_epi8(_simd_castps_si(stencilps), _simd_set1_epi32((-1) & 0xff)));
        break;

    case STENCILOP_INVERT:
        // cmpeq(0, 0) yields all-ones, so andnot gives the bitwise complement.
        vUpdated =
            _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
        break;

    case STENCILOP_KEEP:
    default:
        // Nothing to write back.
        return;
    }

    stencilps = _simd_blendv_ps(stencilps, vUpdated, mask);
}
|
||||
|
||||
template <SWR_FORMAT depthFormatT>
|
||||
simdscalar QuantizeDepth(simdscalar const& depth)
|
||||
{
|
||||
SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
|
||||
uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
|
||||
|
||||
if (depthType == SWR_TYPE_FLOAT)
|
||||
{
|
||||
// assume only 32bit float depth supported
|
||||
SWR_ASSERT(depthBpc == 32);
|
||||
|
||||
// matches shader precision, no quantizing needed
|
||||
return depth;
|
||||
}
|
||||
|
||||
// should be unorm depth if not float
|
||||
SWR_ASSERT(depthType == SWR_TYPE_UNORM);
|
||||
|
||||
float quantize = (float)((1 << depthBpc) - 1);
|
||||
simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));
|
||||
result = _simd_add_ps(result, _simd_set1_ps(0.5f));
|
||||
result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
|
||||
|
||||
if (depthBpc > 16)
|
||||
{
|
||||
result = _simd_div_ps(result, _simd_set1_ps(quantize));
|
||||
}
|
||||
else
|
||||
{
|
||||
result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Runs the depth and stencil tests for one SIMD group of pixels.
///
/// @param pState        API state supplying depth/stencil state and viewports.
/// @param frontFacing   Selects front-face stencil state; back-face state is
///                      used only when this is false and double-sided stencil
///                      testing is enabled.
/// @param viewportIndex Index into pState->vp for the Z clamp range.
/// @param iZ            Interpolated Z values.
/// @param pDepthBase    Depth hot tile data (R32_FLOAT), read with an aligned load.
/// @param coverageMask  Per-lane coverage mask.
/// @param pStencilBase  Stencil hot tile data (R8_UINT).
/// @param pStencilMask  [out] Per-lane stencil test result.
/// @return Per-lane mask: depth result AND stencil result AND coverage.
INLINE
simdscalar DepthStencilTest(const API_STATE*  pState,
                            bool              frontFacing,
                            uint32_t          viewportIndex,
                            simdscalar const& iZ,
                            uint8_t*          pDepthBase,
                            simdscalar const& coverageMask,
                            uint8_t*          pStencilBase,
                            simdscalar*       pStencilMask)
{
    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
    static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");

    const SWR_DEPTH_STENCIL_STATE& dsState  = pState->depthStencilState;
    const SWR_VIEWPORT&            viewport = pState->vp[viewportIndex];

    // clamp Z to viewport [minZ..maxZ]
    const simdscalar vMinZ   = _simd_broadcast_ss(&viewport.minZ);
    const simdscalar vMaxZ   = _simd_broadcast_ss(&viewport.maxZ);
    const simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));

    // Depth test: starts as "all lanes pass" (what ALWAYS and a disabled test yield).
    simdscalar depthResult = _simd_set1_ps(-1.0f);

    if (dsState.depthTestEnable)
    {
        const uint32_t depthFunc = dsState.depthTestFunc;

        if (depthFunc == ZFUNC_NEVER)
        {
            depthResult = _simd_setzero_ps();
        }
        else if (depthFunc != ZFUNC_ALWAYS)
        {
            // Only functions that actually compare need the stored depth.
            const simdscalar zbuf = _simd_load_ps((const float*)pDepthBase);

            switch (depthFunc)
            {
            case ZFUNC_LE:
                depthResult = _simd_cmple_ps(interpZ, zbuf);
                break;
            case ZFUNC_LT:
                depthResult = _simd_cmplt_ps(interpZ, zbuf);
                break;
            case ZFUNC_GT:
                depthResult = _simd_cmpgt_ps(interpZ, zbuf);
                break;
            case ZFUNC_GE:
                depthResult = _simd_cmpge_ps(interpZ, zbuf);
                break;
            case ZFUNC_EQ:
                depthResult = _simd_cmpeq_ps(interpZ, zbuf);
                break;
            case ZFUNC_NE:
                depthResult = _simd_cmpneq_ps(interpZ, zbuf);
                break;
            default:
                break;
            }
        }
    }

    // Stencil test: same "all lanes pass" default.
    simdscalar stencilMask = _simd_set1_ps(-1.0f);

    if (dsState.stencilTestEnable)
    {
        const bool useFrontState = frontFacing || !dsState.doubleSidedStencilTestEnable;

        const uint8_t stencilRefValue =
            useFrontState ? dsState.stencilRefValue : dsState.backfaceStencilRefValue;
        const uint32_t stencilTestFunc =
            useFrontState ? dsState.stencilTestFunc : dsState.backfaceStencilTestFunc;
        const uint8_t stencilTestMask =
            useFrontState ? dsState.stencilTestMask : dsState.backfaceStencilTestMask;

        if (stencilTestFunc == ZFUNC_NEVER)
        {
            stencilMask = _simd_setzero_ps();
        }
        else if (stencilTestFunc != ZFUNC_ALWAYS)
        {
            simdvector sbuf;
            LoadSOA<R8_UINT>(pStencilBase, sbuf);

            // apply stencil read mask
            const simdscalari vMaskedStencil =
                _simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask));

            // do stencil compare in float to avoid simd integer emulation in AVX1
            const simdscalar stencilWithMask = _simd_cvtepi32_ps(vMaskedStencil);
            const simdscalar stencilRef =
                _simd_set1_ps((float)(stencilRefValue & stencilTestMask));

            // Comparisons are "reference <func> stored value".
            switch (stencilTestFunc)
            {
            case ZFUNC_LE:
                stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask);
                break;
            case ZFUNC_LT:
                stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask);
                break;
            case ZFUNC_GT:
                stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask);
                break;
            case ZFUNC_GE:
                stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask);
                break;
            case ZFUNC_EQ:
                stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask);
                break;
            case ZFUNC_NE:
                stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask);
                break;
            default:
                break;
            }
        }
    }

    *pStencilMask = stencilMask;

    // A lane is written only if it passes depth and stencil and is covered.
    return _simd_and_ps(_simd_and_ps(depthResult, stencilMask), coverageMask);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Writes depth and stencil results for one SIMD group of pixels.
/// @param pViewport    - supplies minZ/maxZ used to clamp the depth value
/// @param pDSState     - depth/stencil state (write enables, ops, masks)
/// @param frontFacing  - selects front-face stencil state; back-face state
///                       is used only when double-sided stencil is enabled
/// @param iZ           - interpolated depth per lane
/// @param pDepthBase   - destination depth storage for the SIMD group
/// @param depthMask    - per-lane depth test result
/// @param coverageMask - per-lane coverage
/// @param pStencilBase - stencil storage for the SIMD group (R8_UINT SOA)
/// @param stencilMask  - per-lane stencil test result
INLINE
void DepthStencilWrite(const SWR_VIEWPORT*            pViewport,
                       const SWR_DEPTH_STENCIL_STATE* pDSState,
                       bool                           frontFacing,
                       simdscalar const&              iZ,
                       uint8_t*                       pDepthBase,
                       const simdscalar&              depthMask,
                       const simdscalar&              coverageMask,
                       uint8_t*                       pStencilBase,
                       const simdscalar&              stencilMask)
{
    if (pDSState->depthWriteEnable)
    {
        // Clamp Z to the viewport range [minZ..maxZ] before storing.
        simdscalar zLow     = _simd_broadcast_ss(&pViewport->minZ);
        simdscalar zHigh    = _simd_broadcast_ss(&pViewport->maxZ);
        simdscalar clampedZ = _simd_min_ps(zHigh, _simd_max_ps(zLow, iZ));

        // Only lanes that are both covered and passed the depth test are written.
        simdscalar writeLanes = _simd_and_ps(depthMask, coverageMask);
        _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(writeLanes), clampedZ);
    }

    if (!pDSState->stencilWriteEnable)
    {
        return;
    }

    // Fetch the current stencil values.
    simdvector stencilIn;
    LoadSOA<R8_UINT>(pStencilBase, stencilIn);
    simdscalar origStencil = stencilIn.v[0];

    // Pick front- or back-face stencil state.
    const bool useFrontState = frontFacing || !pDSState->doubleSidedStencilTestEnable;

    const uint8_t refValue =
        useFrontState ? pDSState->stencilRefValue : pDSState->backfaceStencilRefValue;
    const uint32_t failOp =
        useFrontState ? pDSState->stencilFailOp : pDSState->backfaceStencilFailOp;
    const uint32_t passDepthPassOp = useFrontState ? pDSState->stencilPassDepthPassOp
                                                   : pDSState->backfaceStencilPassDepthPassOp;
    const uint32_t passDepthFailOp = useFrontState ? pDSState->stencilPassDepthFailOp
                                                   : pDSState->backfaceStencilPassDepthFailOp;
    const uint8_t writeMask =
        useFrontState ? pDSState->stencilWriteMask : pDSState->backfaceStencilWriteMask;

    simdscalar stencilps    = origStencil;
    simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(refValue));

    // Per-lane masks for the three stencil outcomes.
    simdscalar stencilFailMask          = _simd_andnot_ps(stencilMask, coverageMask);
    simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
    simdscalar stencilPassDepthFailMask =
        _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));

    // Apply the ops in the same order as before: fail, pass/depth-fail, pass/depth-pass.
    StencilOp(static_cast<SWR_STENCILOP>(failOp), stencilFailMask, stencilRefps, stencilps);
    StencilOp(static_cast<SWR_STENCILOP>(passDepthFailOp),
              stencilPassDepthFailMask,
              stencilRefps,
              stencilps);
    StencilOp(static_cast<SWR_STENCILOP>(passDepthPassOp),
              stencilPassDepthPassMask,
              stencilRefps,
              stencilps);

    // Apply the stencil write mask: masked-in bits come from the new value,
    // masked-out bits keep the original buffer contents.
    simdscalari vWriteMask   = _simd_set1_epi32(writeMask);
    simdscalar  vWriteMaskPs = _simd_castsi_ps(vWriteMask);
    stencilps                = _simd_and_ps(stencilps, vWriteMaskPs);
    stencilps                = _simd_or_ps(_simd_andnot_ps(vWriteMaskPs, origStencil), stencilps);

    // Lanes selected by coverageMask take the new value; the rest keep the original.
    simdvector stencilOut;
    stencilOut.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask);
    StoreSOA<R8_UINT>(stencilOut, pStencilBase);
}
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file fifo.hpp
|
||||
*
|
||||
* @brief Definitions for our fifos used for thread communication.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "common/os.h"
|
||||
#include "arena.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
|
||||
template <class T>
|
||||
struct QUEUE
|
||||
{
|
||||
OSALIGNLINE(volatile uint32_t) mLock{0};
|
||||
OSALIGNLINE(volatile uint32_t) mNumEntries{0};
|
||||
std::vector<T*> mBlocks;
|
||||
T* mCurBlock{nullptr};
|
||||
uint32_t mHead{0};
|
||||
uint32_t mTail{0};
|
||||
uint32_t mCurBlockIdx{0};
|
||||
|
||||
// power of 2
|
||||
static const uint32_t mBlockSizeShift = 6;
|
||||
static const uint32_t mBlockSize = 1 << mBlockSizeShift;
|
||||
|
||||
template <typename ArenaT>
|
||||
void clear(ArenaT& arena)
|
||||
{
|
||||
mHead = 0;
|
||||
mTail = 0;
|
||||
mBlocks.clear();
|
||||
T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
|
||||
mBlocks.push_back(pNewBlock);
|
||||
mCurBlock = pNewBlock;
|
||||
mCurBlockIdx = 0;
|
||||
mNumEntries = 0;
|
||||
mLock = 0;
|
||||
}
|
||||
|
||||
uint32_t getNumQueued() { return mNumEntries; }
|
||||
|
||||
bool tryLock()
|
||||
{
|
||||
if (mLock)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// try to lock the FIFO
|
||||
long initial = InterlockedCompareExchange(&mLock, 1, 0);
|
||||
return (initial == 0);
|
||||
}
|
||||
|
||||
void unlock() { mLock = 0; }
|
||||
|
||||
T* peek()
|
||||
{
|
||||
if (mNumEntries == 0)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
uint32_t block = mHead >> mBlockSizeShift;
|
||||
return &mBlocks[block][mHead & (mBlockSize - 1)];
|
||||
}
|
||||
|
||||
void dequeue_noinc()
|
||||
{
|
||||
mHead++;
|
||||
mNumEntries--;
|
||||
}
|
||||
|
||||
template <typename ArenaT>
|
||||
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
|
||||
{
|
||||
const float* pSrc = (const float*)entry;
|
||||
float* pDst = (float*)&mCurBlock[mTail];
|
||||
|
||||
auto lambda = [&](int32_t i) {
|
||||
__m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH);
|
||||
_mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc);
|
||||
};
|
||||
|
||||
const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4);
|
||||
static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
|
||||
"FIFO element size should be multiple of SIMD width.");
|
||||
|
||||
UnrollerL<0, numSimdLines, 1>::step(lambda);
|
||||
|
||||
mTail++;
|
||||
if (mTail == mBlockSize)
|
||||
{
|
||||
if (++mCurBlockIdx < mBlocks.size())
|
||||
{
|
||||
mCurBlock = mBlocks[mCurBlockIdx];
|
||||
}
|
||||
else
|
||||
{
|
||||
T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
|
||||
SWR_ASSERT(newBlock);
|
||||
|
||||
mBlocks.push_back(newBlock);
|
||||
mCurBlock = newBlock;
|
||||
}
|
||||
|
||||
mTail = 0;
|
||||
}
|
||||
|
||||
mNumEntries++;
|
||||
return true;
|
||||
}
|
||||
|
||||
void destroy() {}
|
||||
};
|
||||
|
|
@ -1,262 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file format_conversion.h
|
||||
*
|
||||
* @brief API implementation
|
||||
*
|
||||
******************************************************************************/
|
||||
#include "format_types.h"
|
||||
#include "format_traits.h"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Load SIMD packed pixels in SOA format and converts to
|
||||
/// SOA RGBA32_FLOAT format.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param dst - output data in SOA form
|
||||
template <typename SIMD_T, SWR_FORMAT SrcFormat>
|
||||
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst)
|
||||
{
|
||||
// fast path for float32
|
||||
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
|
||||
(FormatTraits<SrcFormat>::GetBPC(0) == 32))
|
||||
{
|
||||
auto lambda = [&](int comp)
|
||||
{
|
||||
Float<SIMD_T> vComp =
|
||||
SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>)));
|
||||
|
||||
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
|
||||
};
|
||||
|
||||
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
|
||||
return;
|
||||
}
|
||||
|
||||
auto lambda = [&](int comp)
|
||||
{
|
||||
// load SIMD components
|
||||
Float<SIMD_T> vComp;
|
||||
FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp);
|
||||
|
||||
// unpack
|
||||
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
|
||||
|
||||
// convert
|
||||
if (FormatTraits<SrcFormat>::isNormalized(comp))
|
||||
{
|
||||
vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp));
|
||||
vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
|
||||
}
|
||||
|
||||
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
|
||||
|
||||
// is there a better way to get this from the SIMD traits?
|
||||
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
|
||||
|
||||
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
|
||||
};
|
||||
|
||||
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
|
||||
}
|
||||
|
||||
template <SWR_FORMAT SrcFormat>
|
||||
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst)
|
||||
{
|
||||
LoadSOA<SIMD256, SrcFormat>(pSrc, dst);
|
||||
}
|
||||
|
||||
template <SWR_FORMAT SrcFormat>
|
||||
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
|
||||
{
|
||||
LoadSOA<SIMD512, SrcFormat>(pSrc, dst);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Clamps the given component based on the requirements on the
|
||||
/// Format template arg
|
||||
/// @param vComp - SIMD vector of floats
|
||||
/// @param Component - component
|
||||
template <typename SIMD_T, SWR_FORMAT Format>
|
||||
INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component)
|
||||
{
|
||||
Float<SIMD_T> vComp = v;
|
||||
if (Component >= 4 || Component < 0)
|
||||
{
|
||||
// Component shouldn't out of <0;3> range
|
||||
assert(false);
|
||||
return vComp;
|
||||
}
|
||||
if (FormatTraits<Format>::isNormalized(Component))
|
||||
{
|
||||
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
|
||||
{
|
||||
vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps());
|
||||
}
|
||||
|
||||
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
|
||||
{
|
||||
vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f));
|
||||
}
|
||||
vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f));
|
||||
}
|
||||
else if (FormatTraits<Format>::GetBPC(Component) < 32)
|
||||
{
|
||||
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
|
||||
{
|
||||
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
|
||||
int iMin = 0;
|
||||
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
|
||||
vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin));
|
||||
vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax));
|
||||
vComp = SIMD_T::castsi_ps(vCompi);
|
||||
}
|
||||
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
|
||||
{
|
||||
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
|
||||
int iMin = -1 - iMax;
|
||||
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
|
||||
vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin));
|
||||
vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax));
|
||||
vComp = SIMD_T::castsi_ps(vCompi);
|
||||
}
|
||||
}
|
||||
|
||||
return vComp;
|
||||
}
|
||||
|
||||
template <SWR_FORMAT Format>
|
||||
INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component)
|
||||
{
|
||||
return Clamp<SIMD256, Format>(v, Component);
|
||||
}
|
||||
|
||||
template <SWR_FORMAT Format>
|
||||
INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
|
||||
{
|
||||
return Clamp<SIMD512, Format>(v, Component);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Normalize the given component based on the requirements on the
|
||||
/// Format template arg
|
||||
/// @param vComp - SIMD vector of floats
|
||||
/// @param Component - component
|
||||
template <typename SIMD_T, SWR_FORMAT Format>
|
||||
INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component)
|
||||
{
|
||||
Float<SIMD_T> r = vComp;
|
||||
if (FormatTraits<Format>::isNormalized(Component))
|
||||
{
|
||||
r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component)));
|
||||
r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r));
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
template <SWR_FORMAT Format>
|
||||
INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component)
|
||||
{
|
||||
return Normalize<SIMD256, Format>(vComp, Component);
|
||||
}
|
||||
|
||||
template <SWR_FORMAT Format>
|
||||
INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
|
||||
{
|
||||
return Normalize<SIMD512, Format>(vComp, Component);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Convert and store simdvector of pixels in SOA
|
||||
/// RGBA32_FLOAT to SOA format
|
||||
/// @param src - source data in SOA form
|
||||
/// @param dst - output data in SOA form
|
||||
template <typename SIMD_T, SWR_FORMAT DstFormat>
|
||||
INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst)
|
||||
{
|
||||
// fast path for float32
|
||||
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
|
||||
(FormatTraits<DstFormat>::GetBPC(0) == 32))
|
||||
{
|
||||
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
|
||||
{
|
||||
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
|
||||
|
||||
// Gamma-correct
|
||||
if (FormatTraits<DstFormat>::isSRGB)
|
||||
{
|
||||
if (comp < 3) // Input format is always RGBA32_FLOAT.
|
||||
{
|
||||
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
|
||||
}
|
||||
}
|
||||
|
||||
SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
auto lambda = [&](int comp) {
|
||||
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
|
||||
|
||||
// Gamma-correct
|
||||
if (FormatTraits<DstFormat>::isSRGB)
|
||||
{
|
||||
if (comp < 3) // Input format is always RGBA32_FLOAT.
|
||||
{
|
||||
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
|
||||
}
|
||||
}
|
||||
|
||||
// clamp
|
||||
vComp = Clamp<SIMD_T, DstFormat>(vComp, comp);
|
||||
|
||||
// normalize
|
||||
vComp = Normalize<SIMD_T, DstFormat>(vComp, comp);
|
||||
|
||||
// pack
|
||||
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
|
||||
|
||||
// store
|
||||
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
|
||||
|
||||
// is there a better way to get this from the SIMD traits?
|
||||
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
|
||||
|
||||
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
|
||||
};
|
||||
|
||||
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
|
||||
}
|
||||
|
||||
template <SWR_FORMAT DstFormat>
|
||||
INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst)
|
||||
{
|
||||
StoreSOA<SIMD256, DstFormat>(src, pDst);
|
||||
}
|
||||
|
||||
template <SWR_FORMAT DstFormat>
|
||||
INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
|
||||
{
|
||||
StoreSOA<SIMD512, DstFormat>(src, pDst);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -1,939 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file utils.h
|
||||
*
|
||||
* @brief Utilities used by SWR core related to pixel formats.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "core/utils.h"
|
||||
#include "common/simdintrin.h"
|
||||
|
||||
/// @brief In-place 4x4 transpose of four 4-wide float rows, performed with
///        32-bit and 64-bit integer unpack operations on the raw bits.
INLINE
void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
{
    // Reinterpret the rows as integers; no value conversion takes place.
    simd4scalari in0 = SIMD128::castps_si(row0);
    simd4scalari in1 = SIMD128::castps_si(row1);
    simd4scalari in2 = SIMD128::castps_si(row2);
    simd4scalari in3 = SIMD128::castps_si(row3);

    // Interleave 32-bit elements of row pairs (2,3) and (0,1).
    simd4scalari lo23 = SIMD128::unpacklo_epi32(in2, in3);
    simd4scalari hi23 = SIMD128::unpackhi_epi32(in2, in3);
    simd4scalari lo01 = SIMD128::unpacklo_epi32(in0, in1);
    simd4scalari hi01 = SIMD128::unpackhi_epi32(in0, in1);

    // Combine 64-bit halves to form the transposed rows.
    simd4scalari out0 = SIMD128::unpacklo_epi64(lo01, lo23);
    simd4scalari out1 = SIMD128::unpackhi_epi64(lo01, lo23);
    simd4scalari out2 = SIMD128::unpacklo_epi64(hi01, hi23);
    simd4scalari out3 = SIMD128::unpackhi_epi64(hi01, hi23);

    row0 = SIMD128::castsi_ps(out0);
    row1 = SIMD128::castsi_ps(out1);
    row2 = SIMD128::castsi_ps(out2);
    row3 = SIMD128::castsi_ps(out3);
}
|
||||
|
||||
/// @brief In-place 4x4 transpose of four 4-wide integer rows using 32-bit
///        and 64-bit unpack operations.
INLINE
void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
{
    // Interleave 32-bit elements of row pairs (2,3) and (0,1).
    simd4scalari lo23 = SIMD128::unpacklo_epi32(row2, row3);
    simd4scalari hi23 = SIMD128::unpackhi_epi32(row2, row3);
    simd4scalari lo01 = SIMD128::unpacklo_epi32(row0, row1);
    simd4scalari hi01 = SIMD128::unpackhi_epi32(row0, row1);

    // Combine 64-bit halves to form the transposed rows.
    row0 = SIMD128::unpacklo_epi64(lo01, lo23);
    row1 = SIMD128::unpackhi_epi64(lo01, lo23);
    row2 = SIMD128::unpacklo_epi64(hi01, hi23);
    row3 = SIMD128::unpackhi_epi64(hi01, hi23);
}
|
||||
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
/// @brief Transposes three 8-wide SOA registers (x, y, z) into eight 4-wide
///        AOS vectors; the fourth (w) element of each output comes from a
///        zero register.
INLINE
void vTranspose3x8(simd4scalar (&vDst)[8],
                   const simdscalar& vSrc0,
                   const simdscalar& vSrc1,
                   const simdscalar& vSrc2)
{
    simdscalar xzLo = _simd_unpacklo_ps(vSrc0, vSrc2);              // x0z0x1z1 x4z4x5z5
    simdscalar ywLo = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
    simdscalar xzHi = _simd_unpackhi_ps(vSrc0, vSrc2);              // x2z2x3z3 x6z6x7z7
    simdscalar ywHi = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6y7w7

    simdscalar pix04 = _simd_unpacklo_ps(xzLo, ywLo); // x0y0z0w0 x4y4z4w4
    simdscalar pix15 = _simd_unpackhi_ps(xzLo, ywLo); // x1y1z1w1 x5y5z5w5
    simdscalar pix26 = _simd_unpacklo_ps(xzHi, ywHi); // x2y2z2w2 x6y6z6w6
    simdscalar pix37 = _simd_unpackhi_ps(xzHi, ywHi); // x3y3z3w3 x7y7z7w7

    // Low 128-bit halves hold pixels 0-3.
    vDst[0] = _simd_extractf128_ps(pix04, 0);
    vDst[1] = _simd_extractf128_ps(pix15, 0);
    vDst[2] = _simd_extractf128_ps(pix26, 0);
    vDst[3] = _simd_extractf128_ps(pix37, 0);

    // High 128-bit halves hold pixels 4-7.
    vDst[4] = _simd_extractf128_ps(pix04, 1);
    vDst[5] = _simd_extractf128_ps(pix15, 1);
    vDst[6] = _simd_extractf128_ps(pix26, 1);
    vDst[7] = _simd_extractf128_ps(pix37, 1);
}
|
||||
|
||||
/// @brief Transposes four 8-wide SOA registers (x, y, z, w) into eight
///        4-wide AOS vectors.
INLINE
void vTranspose4x8(simd4scalar (&vDst)[8],
                   const simdscalar& vSrc0,
                   const simdscalar& vSrc1,
                   const simdscalar& vSrc2,
                   const simdscalar& vSrc3)
{
    simdscalar xzLo = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
    simdscalar ywLo = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
    simdscalar xzHi = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
    simdscalar ywHi = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6y7w7

    simdscalar pix04 = _simd_unpacklo_ps(xzLo, ywLo); // x0y0z0w0 x4y4z4w4
    simdscalar pix15 = _simd_unpackhi_ps(xzLo, ywLo); // x1y1z1w1 x5y5z5w5
    simdscalar pix26 = _simd_unpacklo_ps(xzHi, ywHi); // x2y2z2w2 x6y6z6w6
    simdscalar pix37 = _simd_unpackhi_ps(xzHi, ywHi); // x3y3z3w3 x7y7z7w7

    // Low 128-bit halves hold pixels 0-3.
    vDst[0] = _simd_extractf128_ps(pix04, 0);
    vDst[1] = _simd_extractf128_ps(pix15, 0);
    vDst[2] = _simd_extractf128_ps(pix26, 0);
    vDst[3] = _simd_extractf128_ps(pix37, 0);

    // High 128-bit halves hold pixels 4-7.
    vDst[4] = _simd_extractf128_ps(pix04, 1);
    vDst[5] = _simd_extractf128_ps(pix15, 1);
    vDst[6] = _simd_extractf128_ps(pix26, 1);
    vDst[7] = _simd_extractf128_ps(pix37, 1);
}
|
||||
|
||||
/// @brief Transposes four 16-wide SOA registers (r, g, b, a) into four
///        16-wide registers of interleaved components.
INLINE
void vTranspose4x16(simd16scalar (&dst)[4],
                    const simd16scalar& src0,
                    const simd16scalar& src1,
                    const simd16scalar& src2,
                    const simd16scalar& src3)
{
    // Pre-permute the inputs so the unpack sequence below leaves the
    // elements in the right order.
    const simd16scalari laneOrder =
        _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    simd16scalar r = _simd16_permute_ps(src0, laneOrder);
    simd16scalar g = _simd16_permute_ps(src1, laneOrder);
    simd16scalar b = _simd16_permute_ps(src2, laneOrder);
    simd16scalar a = _simd16_permute_ps(src3, laneOrder);

    // First interleave: r with b, g with a.
    simd16scalar rbLo = _simd16_unpacklo_ps(r, b);
    simd16scalar gaLo = _simd16_unpacklo_ps(g, a);
    simd16scalar rbHi = _simd16_unpackhi_ps(r, b);
    simd16scalar gaHi = _simd16_unpackhi_ps(g, a);

    // Second interleave produces r,g,b,a groups.
    dst[0] = _simd16_unpacklo_ps(rbLo, gaLo);
    dst[1] = _simd16_unpackhi_ps(rbLo, gaLo);
    dst[2] = _simd16_unpacklo_ps(rbHi, gaHi);
    dst[3] = _simd16_unpackhi_ps(rbHi, gaHi);
}
|
||||
|
||||
/// @brief Transposes eight 8-wide float registers into vDst using unpack,
///        shuffle and 128-bit lane permute steps.
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
                   const simdscalar& vMask0,
                   const simdscalar& vMask1,
                   const simdscalar& vMask2,
                   const simdscalar& vMask3,
                   const simdscalar& vMask4,
                   const simdscalar& vMask5,
                   const simdscalar& vMask6,
                   const simdscalar& vMask7)
{
    // Step 1: interleave adjacent row pairs.
    simdscalar lo01 = _simd_unpacklo_ps(vMask0, vMask1);
    simdscalar hi01 = _simd_unpackhi_ps(vMask0, vMask1);
    simdscalar lo23 = _simd_unpacklo_ps(vMask2, vMask3);
    simdscalar hi23 = _simd_unpackhi_ps(vMask2, vMask3);
    simdscalar lo45 = _simd_unpacklo_ps(vMask4, vMask5);
    simdscalar hi45 = _simd_unpackhi_ps(vMask4, vMask5);
    simdscalar lo67 = _simd_unpacklo_ps(vMask6, vMask7);
    simdscalar hi67 = _simd_unpackhi_ps(vMask6, vMask7);

    // Step 2: gather four-row groups within each 128-bit lane.
    simdscalar q0 = _simd_shuffle_ps(lo01, lo23, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar q1 = _simd_shuffle_ps(lo01, lo23, _MM_SHUFFLE(3, 2, 3, 2));
    simdscalar q2 = _simd_shuffle_ps(hi01, hi23, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar q3 = _simd_shuffle_ps(hi01, hi23, _MM_SHUFFLE(3, 2, 3, 2));
    simdscalar q4 = _simd_shuffle_ps(lo45, lo67, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar q5 = _simd_shuffle_ps(lo45, lo67, _MM_SHUFFLE(3, 2, 3, 2));
    simdscalar q6 = _simd_shuffle_ps(hi45, hi67, _MM_SHUFFLE(1, 0, 1, 0));
    simdscalar q7 = _simd_shuffle_ps(hi45, hi67, _MM_SHUFFLE(3, 2, 3, 2));

    // Step 3: combine the low 128-bit lanes (0x20), then the high lanes (0x31).
    vDst[0] = _simd_permute2f128_ps(q0, q4, 0x20);
    vDst[1] = _simd_permute2f128_ps(q1, q5, 0x20);
    vDst[2] = _simd_permute2f128_ps(q2, q6, 0x20);
    vDst[3] = _simd_permute2f128_ps(q3, q7, 0x20);
    vDst[4] = _simd_permute2f128_ps(q0, q4, 0x31);
    vDst[5] = _simd_permute2f128_ps(q1, q5, 0x31);
    vDst[6] = _simd_permute2f128_ps(q2, q6, 0x31);
    vDst[7] = _simd_permute2f128_ps(q3, q7, 0x31);
}
|
||||
|
||||
/// @brief Integer-input overload: reinterprets each input register as float
///        bits and forwards to the float vTranspose8x8.
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
                   const simdscalari& vMask0,
                   const simdscalari& vMask1,
                   const simdscalari& vMask2,
                   const simdscalari& vMask3,
                   const simdscalari& vMask4,
                   const simdscalari& vMask5,
                   const simdscalari& vMask6,
                   const simdscalari& vMask7)
{
    // Casts only change the register type; the bits are untouched.
    vTranspose8x8(vDst,
                  _simd_castsi_ps(vMask0),
                  _simd_castsi_ps(vMask1),
                  _simd_castsi_ps(vMask2),
                  _simd_castsi_ps(vMask3),
                  _simd_castsi_ps(vMask4),
                  _simd_castsi_ps(vMask5),
                  _simd_castsi_ps(vMask6),
                  _simd_castsi_ps(vMask7));
}
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// TranposeSingleComponent
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
template <uint32_t bpp>
|
||||
struct TransposeSingleComponent
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Pass-thru for single component.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose8_8_8_8
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose8_8_8_8
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
|
||||
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
#if KNOB_ARCH <= KNOB_ARCH_AVX
|
||||
simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
|
||||
simd4scalari c2c3 =
|
||||
SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
|
||||
simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
|
||||
simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
|
||||
simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
|
||||
simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
|
||||
simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
|
||||
simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
|
||||
SIMD128::store_si((simd4scalari*)pDst, c0123lo);
|
||||
SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
|
||||
#else
|
||||
simdscalari dst01 = _simd_shuffle_epi8(src,
|
||||
_simd_set_epi32(0x0f078080,
|
||||
0x0e068080,
|
||||
0x0d058080,
|
||||
0x0c048080,
|
||||
0x80800b03,
|
||||
0x80800a02,
|
||||
0x80800901,
|
||||
0x80800800));
|
||||
simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
|
||||
dst23 = _simd_shuffle_epi8(dst23,
|
||||
_simd_set_epi32(0x80800f07,
|
||||
0x80800e06,
|
||||
0x80800d05,
|
||||
0x80800c04,
|
||||
0x0b038080,
|
||||
0x0a028080,
|
||||
0x09018080,
|
||||
0x08008080));
|
||||
simdscalari dst = _simd_or_si(dst01, dst23);
|
||||
_simd_store_si((simdscalari*)pDst, dst);
|
||||
#endif
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
|
||||
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
|
||||
simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
|
||||
simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
|
||||
|
||||
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
|
||||
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
|
||||
simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
|
||||
simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
|
||||
|
||||
simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
|
||||
simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
|
||||
simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
|
||||
|
||||
simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
|
||||
|
||||
_simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose8_8_8
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 8_8_8 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose8_8
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose8_8
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 8_8 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
|
||||
|
||||
simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
|
||||
simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
|
||||
rg = SIMD128::unpacklo_epi8(rg, g);
|
||||
SIMD128::store_si((simd4scalari*)pDst, rg);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
|
||||
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
|
||||
|
||||
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
|
||||
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
|
||||
|
||||
simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
|
||||
|
||||
simdscalari dst = _simd_or_si(cvt0, shl1);
|
||||
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose32_32_32_32
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose32_32_32_32
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
simdscalar src0 = _simd_load_ps((const float*)pSrc);
|
||||
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
|
||||
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
|
||||
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
|
||||
|
||||
simd4scalar vDst[8];
|
||||
vTranspose4x8(vDst, src0, src1, src2, src3);
|
||||
SIMD128::store_ps((float*)pDst, vDst[0]);
|
||||
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
|
||||
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
|
||||
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
|
||||
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
|
||||
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
|
||||
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
|
||||
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
|
||||
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
|
||||
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
|
||||
simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
|
||||
|
||||
simd16scalar dst[4];
|
||||
|
||||
vTranspose4x16(dst, src0, src1, src2, src3);
|
||||
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose32_32_32
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose32_32_32
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
simdscalar src0 = _simd_load_ps((const float*)pSrc);
|
||||
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
|
||||
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
|
||||
|
||||
simd4scalar vDst[8];
|
||||
vTranspose3x8(vDst, src0, src1, src2);
|
||||
SIMD128::store_ps((float*)pDst, vDst[0]);
|
||||
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
|
||||
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
|
||||
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
|
||||
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
|
||||
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
|
||||
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
|
||||
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
|
||||
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
|
||||
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
|
||||
simd16scalar src3 = _simd16_setzero_ps();
|
||||
|
||||
simd16scalar dst[4];
|
||||
|
||||
vTranspose4x16(dst, src0, src1, src2, src3);
|
||||
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose32_32
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose32_32
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 32_32 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
const float* pfSrc = (const float*)pSrc;
|
||||
simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
|
||||
simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
|
||||
simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
|
||||
simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
|
||||
|
||||
simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
|
||||
simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
|
||||
simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
|
||||
simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
|
||||
|
||||
float* pfDst = (float*)pDst;
|
||||
SIMD128::store_ps(pfDst + 0, dst0);
|
||||
SIMD128::store_ps(pfDst + 4, dst1);
|
||||
SIMD128::store_ps(pfDst + 8, dst2);
|
||||
SIMD128::store_ps(pfDst + 12, dst3);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
|
||||
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
|
||||
|
||||
simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
|
||||
simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
|
||||
|
||||
simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
|
||||
simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
|
||||
|
||||
simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
|
||||
simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
|
||||
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
|
||||
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose16_16_16_16
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose16_16_16_16
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
|
||||
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
|
||||
|
||||
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
|
||||
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
|
||||
simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
|
||||
simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
|
||||
|
||||
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
|
||||
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
|
||||
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
|
||||
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
|
||||
|
||||
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
|
||||
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
|
||||
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
|
||||
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
|
||||
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
|
||||
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
|
||||
simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
|
||||
simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
|
||||
|
||||
simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
|
||||
simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
|
||||
simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
|
||||
simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
|
||||
|
||||
simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
|
||||
simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
|
||||
simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
|
||||
simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
|
||||
|
||||
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
|
||||
simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
|
||||
simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
|
||||
simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
|
||||
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose16_16_16
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose16_16_16
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
|
||||
|
||||
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
|
||||
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
|
||||
simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
|
||||
simd4scalari src_a = SIMD128::setzero_si();
|
||||
|
||||
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
|
||||
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
|
||||
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
|
||||
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
|
||||
|
||||
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
|
||||
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
|
||||
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
|
||||
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
|
||||
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
|
||||
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
|
||||
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
|
||||
simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
|
||||
simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
|
||||
|
||||
simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
|
||||
simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
|
||||
simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
|
||||
simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
|
||||
|
||||
simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
|
||||
simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
|
||||
simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
|
||||
simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
|
||||
|
||||
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
|
||||
simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
|
||||
simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
|
||||
simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
|
||||
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose16_16
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose16_16
|
||||
{
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Performs an SOA to AOS conversion for packed 16_16 data.
|
||||
/// @param pSrc - source data in SOA form
|
||||
/// @param pDst - output data in AOS form
|
||||
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
simdscalar src = _simd_load_ps((const float*)pSrc);
|
||||
|
||||
simd4scalar comp0 = _simd_extractf128_ps(src, 0);
|
||||
simd4scalar comp1 = _simd_extractf128_ps(src, 1);
|
||||
|
||||
simd4scalari comp0i = SIMD128::castps_si(comp0);
|
||||
simd4scalari comp1i = SIMD128::castps_si(comp1);
|
||||
|
||||
simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
|
||||
simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
|
||||
|
||||
SIMD128::store_si((simd4scalari*)pDst, resLo);
|
||||
SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
|
||||
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
|
||||
{
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
// clang-format off
|
||||
|
||||
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
|
||||
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
|
||||
|
||||
simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
|
||||
simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
|
||||
|
||||
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
|
||||
simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
|
||||
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
|
||||
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
|
||||
|
||||
// clang-format on
|
||||
#else
|
||||
#error Unsupported vector width
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose24_8
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose32_8_24
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose4_4_4_4
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose5_6_5
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose9_9_9_5
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose5_5_5_1
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose1_5_5_5
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose10_10_10_2
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose11_11_10
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose64
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the 64 format.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose64_64
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the 64_64 format.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose64_64_64
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the 64_64_64 format.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Transpose64_64_64_64
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the 64_64_64_64 format.
    ///        No implementation is provided: both entry points are declared
    ///        deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

    /// @brief SIMD16 variant of Transpose; likewise deleted.
    static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,448 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file frontend.h
|
||||
*
|
||||
* @brief Definitions for Frontend which handles vertex processing,
|
||||
* primitive assembly, clipping, binning, etc.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
#include "context.h"
|
||||
#include "common/simdintrin.h"
|
||||
#include <type_traits>
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Helper macro to generate a bitmask
|
||||
static INLINE uint32_t
|
||||
GenMask(uint32_t numBits)
|
||||
{
|
||||
SWR_ASSERT(
|
||||
numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
|
||||
return ((1U << numBits) - 1);
|
||||
}
|
||||
|
||||
// Calculates the A and B coefficients for the 3 edges of the triangle
|
||||
//
|
||||
// maths for edge equations:
|
||||
// standard form of a line in 2d
|
||||
// Ax + By + C = 0
|
||||
// A = y0 - y1
|
||||
// B = x1 - x0
|
||||
// C = x0y1 - x1y0
|
||||
INLINE
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
    // Inputs are laid out as (v0, v1, v2, dc), where lane 3 is a don't-care.
    // Rotating the first three lanes left gives each lane its "next" vertex:
    //   vYNext = y1 y2 y0 dc
    //   vXNext = x1 x2 x0 dc
    __m128 vYNext = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 vXNext = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));

    // A = current y - next y:
    //   A[0] = y0 - y1
    //   A[1] = y1 - y2
    //   A[2] = y2 - y0
    vA = _mm_sub_ps(vY, vYNext);

    // B = next x - current x:
    //   B[0] = x1 - x0
    //   B[1] = x2 - x1
    //   B[2] = x0 - x2
    vB = _mm_sub_ps(vXNext, vX);
}
|
||||
|
||||
// Integer variant of the edge-equation setup.
//      A = y0 - y1
//      B = x1 - x0
//      C = x0y1 - x1y0   (not computed here)
// Lanes are (v0, v1, v2, dc) on input; on output lane i holds the
// coefficient of the edge that starts at vertex i.
INLINE
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
    // (v1, v2, v0, dc): each vertex paired with the next one around the triangle
    const __m128i vYNext = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128i vXNext = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));

    vA = _mm_sub_epi32(vY, vYNext);
    vB = _mm_sub_epi32(vXNext, vX);
}
|
||||
|
||||
// Edge-equation setup for vertically laid out data: vX[i] / vY[i] hold the
// coordinate of vertex i for every SIMD lane.
//      A[i] = y[i] - y[next(i)]
//      B[i] = x[next(i)] - x[i]
// where next(i) is the following vertex around the triangle.
INLINE
void triangleSetupABIntVertical(const simdscalari vX[3],
                                const simdscalari vY[3],
                                simdscalari (&vA)[3],
                                simdscalari (&vB)[3])
{
    for (uint32_t edge = 0; edge < 3; ++edge)
    {
        const uint32_t next = (edge + 1) % 3;
        vA[edge]            = _simd_sub_epi32(vY[edge], vY[next]);
    }

    for (uint32_t edge = 0; edge < 3; ++edge)
    {
        const uint32_t next = (edge + 1) % 3;
        vB[edge]            = _simd_sub_epi32(vX[next], vX[edge]);
    }
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
// SIMD16 edge-equation setup for vertically laid out data: vX[i] / vY[i]
// hold the coordinate of vertex i for every SIMD lane.
//      A[i] = y[i] - y[next(i)]
//      B[i] = x[next(i)] - x[i]
// where next(i) is the following vertex around the triangle.
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3],
                                const simd16scalari vY[3],
                                simd16scalari (&vA)[3],
                                simd16scalari (&vB)[3])
{
    for (uint32_t edge = 0; edge < 3; ++edge)
    {
        const uint32_t next = (edge + 1) % 3;
        vA[edge]            = _simd16_sub_epi32(vY[edge], vY[next]);
    }

    for (uint32_t edge = 0; edge < 3; ++edge)
    {
        const uint32_t next = (edge + 1) % 3;
        vB[edge]            = _simd16_sub_epi32(vX[next], vX[edge]);
    }
}
|
||||
|
||||
#endif
|
||||
// Calculate the determinant of the triangle.
// Take 2 vectors between the 3 points, P and Q:
//      Px = x0-x2, Py = y0-y2
//      Qx = x1-x2, Qy = y1-y2
//            |Px Qx|
//      det = |     | = PxQy - PyQx
//            |Py Qy|
// which expands to (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2).
// The A & B coefficients that were already computed can be reused by
// factoring a -1 out of both Py and Qx:
//      B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
//      B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
//      B[2]*A[1] - A[2]*B[1]
INLINE
float calcDeterminantInt(const __m128i vA, const __m128i vB)
{
    // Put the operands of both products into 32-bit lanes 0 and 2, the lanes
    // _mm_mul_epi32 reads:
    //      aOperands = [A1, A0, A2, A0]
    //      bOperands = [B2, B0, B1, B0]
    const __m128i aOperands = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
    const __m128i bOperands = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));

    // products = [A1*B2, A2*B1] as two 64-bit values
    const __m128i products = _mm_mul_epi32(aOperands, bOperands);

    // bring the upper 64-bit product (A2*B1) down into the lower half
    const __m128i upperProduct = _mm_shuffle_epi32(products, _MM_SHUFFLE(3, 2, 3, 2));

    // lower 64 bits = A1*B2 - A2*B1
    const __m128i difference = _mm_sub_epi64(products, upperProduct);

    // pull the lower 64 bits out of the register as an integer
    int64_t fixedDet;
    _mm_store_sd((double*)&fixedDet, _mm_castsi128_pd(difference));

    // scale the integer result by 1 / FIXED_POINT16_SCALE in double precision
    // before narrowing to float
    const double scaledDet = (double)fixedDet * (1.0 / FIXED_POINT16_SCALE);

    return (float)scaledDet;
}
|
||||
|
||||
// Per-lane determinant A1*B2 - A2*B1 for vertically laid out coefficients
// (see calcDeterminantInt for the derivation). The 64-bit results for
// primitives 0..3 are written to pvDet[0] and those for 4..7 to pvDet[1].
INLINE
void calcDeterminantIntVertical(const simdscalari vA[3],
                                const simdscalari vB[3],
                                simdscalari*      pvDet)
{
    // Duplicate every 32-bit element so each 64-bit slot carries one operand.
    // "Lo" covers primitives 0 0 1 1 4 4 5 5, "Hi" covers 2 2 3 3 6 6 7 7.
    const simdscalari a1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);
    const simdscalari a1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);
    const simdscalari b2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
    const simdscalari b2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);

    const simdscalari a2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
    const simdscalari a2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
    const simdscalari b1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
    const simdscalari b1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);

    // A1*B2 (64-bit products)
    const simdscalari a1b2Lo = _simd_mul_epi32(a1Lo, b2Lo); // 0 1 4 5
    const simdscalari a1b2Hi = _simd_mul_epi32(a1Hi, b2Hi); // 2 3 6 7

    // A2*B1 (64-bit products)
    const simdscalari a2b1Lo = _simd_mul_epi32(a2Lo, b1Lo);
    const simdscalari a2b1Hi = _simd_mul_epi32(a2Hi, b1Hi);

    // A1*B2 - A2*B1
    const simdscalari diffLo = _simd_sub_epi64(a1b2Lo, a2b1Lo);
    const simdscalari diffHi = _simd_sub_epi64(a1b2Hi, a2b1Hi);

    // reorder 0 1 4 5 | 2 3 6 7 into 0 1 2 3 and 4 5 6 7
    pvDet[0] = _simd_permute2f128_si(diffLo, diffHi, 0x20);
    pvDet[1] = _simd_permute2f128_si(diffLo, diffHi, 0x31);
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
// SIMD16 per-lane determinant A1*B2 - A2*B1 for vertically laid out
// coefficients (see calcDeterminantInt for the derivation). The 64-bit
// results for primitives 0..7 go to pvDet[0] and those for 8..F to pvDet[1].
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3],
                                const simd16scalari vB[3],
                                simd16scalari*      pvDet)
{
    // Duplicate every 32-bit element so each 64-bit slot carries one operand.
    // lo: X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
    // hi: X 2 X 3 X 6 X 7 X A X B X E X F
    const simd16scalari a1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]);
    const simd16scalari a1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]);
    const simd16scalari b2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    const simd16scalari b2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    const simd16scalari a2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    const simd16scalari a2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
    const simd16scalari b1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    const simd16scalari b1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    // A1*B2
    const simd16scalari a1b2Lo = _simd16_mul_epi32(a1Lo, b2Lo); // 0 1 4 5 8 9 C D (64b)
    const simd16scalari a1b2Hi = _simd16_mul_epi32(a1Hi, b2Hi); // 2 3 6 7 A B E F

    // A2*B1
    const simd16scalari a2b1Lo = _simd16_mul_epi32(a2Lo, b1Lo);
    const simd16scalari a2b1Hi = _simd16_mul_epi32(a2Hi, b1Hi);

    // A1*B2 - A2*B1
    const simd16scalari detLo = _simd16_sub_epi64(a1b2Lo, a2b1Lo); // 0 1 4 5 8 9 C D (64b)
    const simd16scalari detHi = _simd16_sub_epi64(a1b2Hi, a2b1Hi); // 2 3 6 7 A B E F

    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
    const simd16scalari lowerHalf = _simd16_permute2f128_si(detLo, detHi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
    const simd16scalari upperHalf = _simd16_permute2f128_si(detLo, detHi, 0xEE); // 8 9 C D A B E F

    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
    pvDet[0] = _simd16_permute2f128_si(lowerHalf, lowerHalf, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
    pvDet[1] = _simd16_permute2f128_si(upperHalf, upperHalf, 0xD8); // 8 9 A B C D E F
}
|
||||
|
||||
#endif
|
||||
// Computes the C coefficient of each edge equation from A, B and the vertex
// positions:
//      C = -Ax - By
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
{
    // vC is written before vB is read, exactly as before, so the result is
    // unchanged even if a caller passes the same object for both references.
    vC = _mm_mul_ps(vA, vX);

    const __m128 vBy = _mm_mul_ps(vB, vY);

    // (-1 * A*x) - B*y
    vC = _mm_sub_ps(_mm_mul_ps(vC, _mm_set1_ps(-1.0f)), vBy);
}
|
||||
|
||||
// Applies the viewport transform of viewport 0 to NumVerts vertices in place:
//      x = x * m00 + m30
//      y = y * m11 + m31
//      z = z * m22 + m32
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    // broadcast element 0 of each matrix term to all lanes
    const simdscalar scaleX  = _simd_load1_ps(&vpMatrices.m00[0]);
    const simdscalar offsetX = _simd_load1_ps(&vpMatrices.m30[0]);
    const simdscalar scaleY  = _simd_load1_ps(&vpMatrices.m11[0]);
    const simdscalar offsetY = _simd_load1_ps(&vpMatrices.m31[0]);
    const simdscalar scaleZ  = _simd_load1_ps(&vpMatrices.m22[0]);
    const simdscalar offsetZ = _simd_load1_ps(&vpMatrices.m32[0]);

    simdvector* const pEnd = v + NumVerts;
    for (simdvector* pVert = v; pVert != pEnd; ++pVert)
    {
        pVert->x = _simd_fmadd_ps(pVert->x, scaleX, offsetX);
        pVert->y = _simd_fmadd_ps(pVert->y, scaleY, offsetY);
        pVert->z = _simd_fmadd_ps(pVert->z, scaleZ, offsetZ);
    }
}
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
// SIMD16: applies the viewport transform of viewport 0 to NumVerts vertices
// in place:
//      x = x * m00 + m30
//      y = y * m11 + m31
//      z = z * m22 + m32
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    // broadcast element 0 of each matrix term to all lanes
    const simd16scalar scaleX  = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar offsetX = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar scaleY  = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar offsetY = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar scaleZ  = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar offsetZ = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    simd16vector* const pEnd = v + NumVerts;
    for (simd16vector* pVert = v; pVert != pEnd; ++pVert)
    {
        pVert->x = _simd16_fmadd_ps(pVert->x, scaleX, offsetX);
        pVert->y = _simd16_fmadd_ps(pVert->y, scaleY, offsetY);
        pVert->z = _simd16_fmadd_ps(pVert->z, scaleZ, offsetZ);
    }
}
|
||||
|
||||
#endif
|
||||
// Applies a per-lane viewport transform to NumVerts vertices in place. Each
// lane selects its own matrix terms through vViewportIdx:
//      x = x * m00[idx] + m30[idx]
//      y = y * m11[idx] + m31[idx]
//      z = z * m22[idx] + m32[idx]
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector*                  v,
                              const SWR_VIEWPORT_MATRICES& vpMatrices,
                              simdscalari const&           vViewportIdx)
{
    // gather every matrix term using the viewport array indexes
    // (element stride of 4 bytes)
    const simdscalar scaleX  = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simdscalar offsetX = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simdscalar scaleY  = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simdscalar offsetY = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simdscalar scaleZ  = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simdscalar offsetZ = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t vert = 0; vert != NumVerts; ++vert)
    {
        simdvector& pos = v[vert];

        pos.x = _simd_fmadd_ps(pos.x, scaleX, offsetX);
        pos.y = _simd_fmadd_ps(pos.y, scaleY, offsetY);
        pos.z = _simd_fmadd_ps(pos.z, scaleZ, offsetZ);
    }
}
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
// SIMD16: applies a per-lane viewport transform to NumVerts vertices in
// place. Each lane selects its own matrix terms through vViewportIdx:
//      x = x * m00[idx] + m30[idx]
//      y = y * m11[idx] + m31[idx]
//      z = z * m22[idx] + m32[idx]
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector*                v,
                              const SWR_VIEWPORT_MATRICES& vpMatrices,
                              simd16scalari const&         vViewportIdx)
{
    // gather every matrix term using the viewport array indexes
    // (element stride of 4 bytes)
    const simd16scalar scaleX  = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar offsetX = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar scaleY  = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar offsetY = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar scaleZ  = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar offsetZ = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t vert = 0; vert != NumVerts; ++vert)
    {
        simd16vector& pos = v[vert];

        pos.x = _simd16_fmadd_ps(pos.x, scaleX, offsetX);
        pos.y = _simd16_fmadd_ps(pos.y, scaleY, offsetY);
        pos.z = _simd16_fmadd_ps(pos.z, scaleZ, offsetZ);
    }
}
|
||||
|
||||
#endif
|
||||
// Computes the integer bounding box of a triangle whose vertex coordinates
// sit in lanes 0..2 of vX / vY (lane 3 is ignored).
INLINE
void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox)
{
    // Horizontal integer min/max over lanes 0..2: build two permutations that
    // bring vertex 1 and vertex 2 into lane 0, then reduce lane-wise. Only
    // lane 0 of each reduction is read back.
    const __m128i vXPerm1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); // x1 x0 x2 x3
    const __m128i vXPerm2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); // x2 x1 x0 x3
    const __m128i vYPerm1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); // y1 y0 y2 y3
    const __m128i vYPerm2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); // y2 y1 y0 y3

    const __m128i vXLow  = _mm_min_epi32(_mm_min_epi32(vX, vXPerm1), vXPerm2);
    const __m128i vXHigh = _mm_max_epi32(_mm_max_epi32(vX, vXPerm1), vXPerm2);
    const __m128i vYLow  = _mm_min_epi32(_mm_min_epi32(vY, vYPerm1), vYPerm2);
    const __m128i vYHigh = _mm_max_epi32(_mm_max_epi32(vY, vYPerm1), vYPerm2);

    bbox.xmin = _mm_extract_epi32(vXLow, 0);
    bbox.xmax = _mm_extract_epi32(vXHigh, 0);
    bbox.ymin = _mm_extract_epi32(vYLow, 0);
    bbox.ymax = _mm_extract_epi32(vYHigh, 0);
}
|
||||
|
||||
// Returns true only when the draw's state is single-sampled, uses a point
// size of exactly 1.0, and has point parameters, point sprites and clip
// distances all disabled.
INLINE
bool CanUseSimplePoints(DRAW_CONTEXT* pDC)
{
    const API_STATE& apiState = GetApiState(pDC);

    if (apiState.rastState.sampleCount != SWR_MULTISAMPLE_1X)
    {
        return false;
    }
    if (apiState.rastState.pointSize != 1.0f)
    {
        return false;
    }
    if (apiState.rastState.pointParam)
    {
        return false;
    }
    if (apiState.rastState.pointSpriteEnable)
    {
        return false;
    }

    return !apiState.backendState.clipDistanceMask;
}
|
||||
|
||||
// Returns true if any of the four lanes of vec is NaN.
INLINE
bool vHasNaN(const __m128& vec)
{
    // An unordered self-compare is true only for NaN lanes; movemask packs
    // the per-lane results into the low four bits.
    return _mm_movemask_ps(_mm_cmpunord_ps(vec, vec)) != 0;
}
|
||||
|
||||
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
|
||||
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
|
||||
|
||||
// ProcessDraw front-end function. All combinations of parameter values are available
|
||||
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
|
||||
bool IsCutIndexEnabled,
|
||||
bool HasTessellation,
|
||||
bool HasGeometryShader,
|
||||
bool HasStreamOut,
|
||||
bool HasRasterization);
|
||||
|
||||
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
|
||||
void ProcessStoreTiles(SWR_CONTEXT* pContext,
|
||||
DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
void* pUserData);
|
||||
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
|
||||
DRAW_CONTEXT* pDC,
|
||||
uint32_t workerId,
|
||||
void* pUserData);
|
||||
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
|
||||
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
|
||||
|
||||
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
|
||||
#endif
|
||||
|
||||
struct PA_STATE_BASE; // forward decl
|
||||
void BinPoints(DRAW_CONTEXT* pDC,
|
||||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
simdvector prims[3],
|
||||
uint32_t primMask,
|
||||
simdscalari const& primID,
|
||||
simdscalari const& viewportIdx,
|
||||
simdscalari const& rtIdx);
|
||||
void BinLines(DRAW_CONTEXT* pDC,
|
||||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
simdvector prims[3],
|
||||
uint32_t primMask,
|
||||
simdscalari const& primID,
|
||||
simdscalari const& viewportIdx,
|
||||
simdscalari const& rtIdx);
|
||||
#if USE_SIMD16_FRONTEND
|
||||
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC,
|
||||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
simd16vector prims[3],
|
||||
uint32_t primMask,
|
||||
simd16scalari const& primID,
|
||||
simd16scalari const& viewportIdx,
|
||||
simd16scalari const& rtIdx);
|
||||
void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC,
|
||||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
simd16vector prims[3],
|
||||
uint32_t primMask,
|
||||
simd16scalari const& primID,
|
||||
simd16scalari const& viewportIdx,
|
||||
simd16scalari const& rtIdx);
|
||||
#endif
|
||||
|
||||
|
|
@ -1,175 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file knobs.h
|
||||
*
|
||||
* @brief Static (Compile-Time) Knobs for Core.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <gen_knobs.h>
|
||||
|
||||
#define KNOB_ARCH_AVX 0
|
||||
#define KNOB_ARCH_AVX2 1
|
||||
#define KNOB_ARCH_AVX512 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// AVX512 Support
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ENABLE_AVX512_SIMD16 1
|
||||
#define USE_SIMD16_FRONTEND 1
|
||||
#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
|
||||
#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Architecture validation
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
#if !defined(KNOB_ARCH)
|
||||
#define KNOB_ARCH KNOB_ARCH_AVX
|
||||
#endif
|
||||
|
||||
#if (KNOB_ARCH == KNOB_ARCH_AVX)
|
||||
#define KNOB_ARCH_ISA AVX
|
||||
#define KNOB_ARCH_STR "AVX"
|
||||
#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
|
||||
#define KNOB_ARCH_ISA AVX2
|
||||
#define KNOB_ARCH_STR "AVX2"
|
||||
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
|
||||
#define KNOB_ARCH_ISA AVX512F
|
||||
#define KNOB_ARCH_STR "AVX512"
|
||||
#else
|
||||
#error "Unknown architecture"
|
||||
#endif
|
||||
|
||||
#define KNOB_SIMD_WIDTH 8
|
||||
#define KNOB_SIMD_BYTES 32
|
||||
|
||||
#define KNOB_SIMD16_WIDTH 16
|
||||
#define KNOB_SIMD16_BYTES 64
|
||||
|
||||
#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Configuration knobs
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Maximum supported number of active vertex buffer streams
|
||||
#define KNOB_NUM_STREAMS 32
|
||||
|
||||
// Maximum supported active viewports and scissors
|
||||
#define KNOB_NUM_VIEWPORTS_SCISSORS 16
|
||||
|
||||
// Guardband range used by the clipper
|
||||
#define KNOB_GUARDBAND_WIDTH 32768.0f
|
||||
#define KNOB_GUARDBAND_HEIGHT 32768.0f
|
||||
|
||||
// Scratch space requirements per worker. Currently only used for TGSM sizing for some stages
|
||||
#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024)
|
||||
|
||||
///////////////////////////////
|
||||
// Macro tile configuration
|
||||
///////////////////////////////
|
||||
|
||||
// raster tile dimensions
|
||||
#define KNOB_TILE_X_DIM 8
|
||||
#define KNOB_TILE_X_DIM_SHIFT 3
|
||||
#define KNOB_TILE_Y_DIM 8
|
||||
#define KNOB_TILE_Y_DIM_SHIFT 3
|
||||
|
||||
// fixed macrotile pixel dimension for now, eventually will be
// dynamically set based on tile format and pixel size
#define KNOB_MACROTILE_X_DIM 32
#define KNOB_MACROTILE_Y_DIM 32
#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13
#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13
#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)

// total # of hot tiles available.  This should be enough to
// fully render a 16kx16k 128bpp render target
#define KNOB_NUM_HOT_TILES_X 512
#define KNOB_NUM_HOT_TILES_Y 512
#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT

// Max scissor rectangle (hot tile count * macrotile dimension).
// The expansions are parenthesized so the macros behave as a single value
// inside larger expressions such as division or modulo.
#define KNOB_MAX_SCISSOR_X (KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM)
#define KNOB_MAX_SCISSOR_Y (KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM)
|
||||
|
||||
#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4
|
||||
#error "incompatible width/tile dimensions"
|
||||
#endif
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
#if KNOB_SIMD16_WIDTH == 16 && KNOB_TILE_X_DIM < 8
|
||||
#error "incompatible width/tile dimensions"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if KNOB_SIMD_WIDTH == 8
|
||||
#define SIMD_TILE_X_DIM 4
|
||||
#define SIMD_TILE_Y_DIM 2
|
||||
#else
|
||||
#error "Invalid simd width"
|
||||
#endif
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
#if KNOB_SIMD16_WIDTH == 16
|
||||
#define SIMD16_TILE_X_DIM 8
|
||||
#define SIMD16_TILE_Y_DIM 2
|
||||
#else
|
||||
#error "Invalid simd width"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Optimization knobs
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
#define KNOB_USE_FAST_SRGB TRUE
|
||||
|
||||
// enables cut-aware primitive assembler
|
||||
#define KNOB_ENABLE_CUT_AWARE_PA TRUE
|
||||
|
||||
// enables early rasterization (useful for small triangles)
|
||||
#if !defined(KNOB_ENABLE_EARLY_RAST)
|
||||
#define KNOB_ENABLE_EARLY_RAST 1
|
||||
#endif
|
||||
|
||||
#if KNOB_ENABLE_EARLY_RAST
|
||||
#define ER_SIMD_TILE_X_SHIFT 2
|
||||
#define ER_SIMD_TILE_Y_SHIFT 2
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Debug knobs
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//#define KNOB_ENABLE_RDTSC
|
||||
|
||||
// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
|
||||
#if !defined(KNOB_ENABLE_TOSS_POINTS)
|
||||
#define KNOB_ENABLE_TOSS_POINTS 0
|
||||
#endif
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file knobs_init.h
|
||||
*
|
||||
* @brief Dynamic Knobs Initialization for Core.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include <core/knobs.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Generic overload: parses the override text as an unsigned integer (base
// auto-detected by strtoul) and stores it in the knob.
// Assumes T is compatible with a 32-bit integer.
// The knob is left untouched when no number could be parsed.
template <typename T>
static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
{
    char*          pParseEnd = nullptr;
    const uint32_t parsed    = strtoul(pOverride, &pParseEnd, 0);

    // strtoul leaves the end pointer at the start when nothing was consumed
    if (pParseEnd == pOverride)
    {
        return;
    }

    knobValue = static_cast<T>(parsed);
}
|
||||
|
||||
// Boolean overload. Accepts a single character y/t/1 (true) or n/f/0
// (false), case-insensitively; anything else is parsed as a number and
// compared against zero. The knob is left untouched when the text is
// neither form.
static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
{
    size_t len = strlen(pOverride);
    if (len == 1)
    {
        // tolower() is only defined for values representable as unsigned
        // char (or EOF); a plain char can be negative, so convert first.
        const int c = tolower(static_cast<unsigned char>(pOverride[0]));
        if (c == 'y' || c == 't' || c == '1')
        {
            knobValue = true;
            return;
        }
        if (c == 'n' || c == 'f' || c == '0')
        {
            knobValue = false;
            return;
        }
    }

    // Try converting to a number and casting to bool
    uint32_t value    = 0;
    char*    pStopped = nullptr;
    value             = strtoul(pOverride, &pStopped, 0);
    if (pStopped != pOverride)
    {
        knobValue = value != 0;
    }
}
|
||||
|
||||
// Float overload: parses the override text with sscanf("%f"). The knob is
// left untouched when no number could be parsed.
static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
{
    float value = knobValue;

    // sscanf returns the number of converted fields, or EOF (a non-zero
    // negative value) on empty input, so test for exactly one conversion
    // rather than for a truthy result.
    if (sscanf(pOverride, "%f", &value) == 1)
    {
        knobValue = value;
    }
}
|
||||
|
||||
// String overload: the override text is taken verbatim as the knob value.
static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue)
{
    knobValue.assign(pOverride);
}
|
||||
|
||||
// Initializes a knob: if an environment variable named knob.Name() exists,
// its text is converted on top of the knob's default value and stored;
// otherwise the default value is stored as is.
template <typename T>
static inline void InitKnob(T& knob)
{
    // Read environment variables
    const char* const pEnvText = getenv(knob.Name());

    if (pEnvText == nullptr)
    {
        // No override present: set default value
        knob.Value(knob.DefaultValue());
        return;
    }

    // Start from the default so a failed conversion still yields a sane value
    auto resolved = knob.DefaultValue();
    ConvertEnvToKnob(pEnvText, resolved);
    knob.Value(resolved);
}
|
||||
|
|
@ -1,459 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file multisample.h
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "context.h"
|
||||
#include "format_traits.h"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief convenience alias used to test for the single sample case
using SingleSampleT = std::integral_constant<int, 1>;
|
||||
|
||||
// Maps a numeric sample count (1, 2, 4, 8 or 16) to the matching
// SWR_MULTISAMPLE_COUNT value. Any other count asserts and falls back to
// SWR_MULTISAMPLE_1X.
INLINE
SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
{
    if (numSamples == 1)
    {
        return SWR_MULTISAMPLE_1X;
    }
    if (numSamples == 2)
    {
        return SWR_MULTISAMPLE_2X;
    }
    if (numSamples == 4)
    {
        return SWR_MULTISAMPLE_4X;
    }
    if (numSamples == 8)
    {
        return SWR_MULTISAMPLE_8X;
    }
    if (numSamples == 16)
    {
        return SWR_MULTISAMPLE_16X;
    }

    // unsupported sample count
    assert(0);
    return SWR_MULTISAMPLE_1X;
}
|
||||
|
||||
// hardcoded offsets based on Direct3d standard multisample positions
// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
// coords are 0.8 fixed point offsets from (0, 0)
//
// Primary template: every accessor is deleted, so only the explicit
// specializations for a given sample count / isCenter pair are usable.
template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false>
struct MultisampleTraits
{
    static const uint32_t numSamples = 0;

    INLINE static float       X(uint32_t sampleNum) = delete;
    INLINE static float       Y(uint32_t sampleNum) = delete;
    INLINE static simdscalari FullSampleMask()      = delete;
};
|
||||
|
||||
// Traits for single sampling at the standard sample position (pixel center).
// The integer tables hold the same positions as the float tables in 0.8
// fixed point (0x80 == 0.5).
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_1X, false>
{
    // X offset of the given sample within the pixel.
    INLINE static float X(uint32_t sampleNum)
    {
        // bounds check, matching the 2X / 4X specializations
        SWR_ASSERT(sampleNum < numSamples);
        return samplePosX[sampleNum];
    }
    // Y offset of the given sample within the pixel.
    INLINE static float Y(uint32_t sampleNum)
    {
        // bounds check, matching the 2X / 4X specializations
        SWR_ASSERT(sampleNum < numSamples);
        return samplePosY[sampleNum];
    }
    // Coverage mask with one bit set per sample, replicated to every lane.
    INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); }

    static const uint32_t              numSamples         = 1;
    static const uint32_t              numCoverageSamples = 1;
    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_1X;
    static constexpr uint32_t          samplePosXi[1]     = {0x80};
    static constexpr uint32_t          samplePosYi[1]     = {0x80};
    static constexpr float             samplePosX[1]      = {0.5f};
    static constexpr float             samplePosY[1]      = {0.5f};
};
|
||||
|
||||
// Traits for single sampling with center sample positions: every position
// query returns the pixel center regardless of the sample index.
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_1X, true>
{
    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_1X;
    static const uint32_t              numSamples         = 1;
    static const uint32_t              numCoverageSamples = 1;

    // 0.8 fixed point positions (0x80 == 0.5) and their float equivalents
    static constexpr uint32_t samplePosXi[1] = {0x80};
    static constexpr uint32_t samplePosYi[1] = {0x80};
    static constexpr float    samplePosX[1]  = {0.5f};
    static constexpr float    samplePosY[1]  = {0.5f};

    INLINE static float X(uint32_t sampleNum) { return 0.5f; }
    INLINE static float Y(uint32_t sampleNum) { return 0.5f; }

    // Coverage mask with one bit set per sample, replicated to every lane.
    INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); }
};
|
||||
|
||||
// Traits for 2x multisampling at the standard sample positions. The integer
// tables hold the same positions as the float tables in 0.8 fixed point
// (0xC0 == 0.75, 0x40 == 0.25).
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_2X, false>
{
    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_2X;
    static const uint32_t              numSamples         = 2;
    static const uint32_t              numCoverageSamples = 2;

    static constexpr uint32_t samplePosXi[2] = {0xC0, 0x40};
    static constexpr uint32_t samplePosYi[2] = {0xC0, 0x40};
    static constexpr float    samplePosX[2]  = {0.75f, 0.25f};
    static constexpr float    samplePosY[2]  = {0.75f, 0.25f};

    // X offset of the given sample within the pixel.
    INLINE static float X(uint32_t sampleNum)
    {
        SWR_ASSERT(sampleNum < numSamples);
        return samplePosX[sampleNum];
    }

    // Y offset of the given sample within the pixel.
    INLINE static float Y(uint32_t sampleNum)
    {
        SWR_ASSERT(sampleNum < numSamples);
        return samplePosY[sampleNum];
    }

    // Coverage mask with one bit set per sample, replicated to every lane.
    INLINE static simdscalari FullSampleMask()
    {
        static const simdscalari allSamples = _simd_set1_epi32(0x3);
        return allSamples;
    }
};
|
||||
|
||||
// Traits for 2x multisampling with center sample positions: both samples
// sit at the pixel center and a single coverage sample is used.
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_2X, true>
{
    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_2X;
    static const uint32_t              numSamples         = 2;
    static const uint32_t              numCoverageSamples = 1;

    // 0.8 fixed point positions (0x80 == 0.5) and their float equivalents
    static constexpr uint32_t samplePosXi[2] = {0x80, 0x80};
    static constexpr uint32_t samplePosYi[2] = {0x80, 0x80};
    static constexpr float    samplePosX[2]  = {0.5f, 0.5f};
    static constexpr float    samplePosY[2]  = {0.5f, 0.5f};

    INLINE static float X(uint32_t sampleNum) { return 0.5f; }
    INLINE static float Y(uint32_t sampleNum) { return 0.5f; }

    // Coverage mask with one bit set per sample, replicated to every lane.
    INLINE static simdscalari FullSampleMask()
    {
        static const simdscalari allSamples = _simd_set1_epi32(0x3);
        return allSamples;
    }
};
|
||||
|
||||
// Traits for 4x multisampling at the standard sample positions. The integer
// tables hold the same positions as the float tables in 0.8 fixed point
// (e.g. 0x60 == 0.375, 0xE0 == 0.875).
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_4X, false>
{
    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_4X;
    static const uint32_t              numSamples         = 4;
    static const uint32_t              numCoverageSamples = 4;

    static constexpr uint32_t samplePosXi[4] = {0x60, 0xE0, 0x20, 0xA0};
    static constexpr uint32_t samplePosYi[4] = {0x20, 0x60, 0xA0, 0xE0};
    static constexpr float    samplePosX[4]  = {0.375f, 0.875f, 0.125f, 0.625f};
    static constexpr float    samplePosY[4]  = {0.125f, 0.375f, 0.625f, 0.875f};

    // X offset of the given sample within the pixel.
    INLINE static float X(uint32_t sampleNum)
    {
        SWR_ASSERT(sampleNum < numSamples);
        return samplePosX[sampleNum];
    }

    // Y offset of the given sample within the pixel.
    INLINE static float Y(uint32_t sampleNum)
    {
        SWR_ASSERT(sampleNum < numSamples);
        return samplePosY[sampleNum];
    }

    // Coverage mask with one bit set per sample, replicated to every lane.
    INLINE static simdscalari FullSampleMask()
    {
        static const simdscalari allSamples = _simd_set1_epi32(0xF);
        return allSamples;
    }
};
|
||||
|
||||
template <>
|
||||
struct MultisampleTraits<SWR_MULTISAMPLE_4X, true>
|
||||
{
|
||||
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
|
||||
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
|
||||
INLINE static simdscalari FullSampleMask()
|
||||
{
|
||||
static const simdscalari mask = _simd_set1_epi32(0xF);
|
||||
return mask;
|
||||
}
|
||||
|
||||
static const uint32_t numSamples = 4;
|
||||
static const uint32_t numCoverageSamples = 1;
|
||||
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
|
||||
static constexpr uint32_t samplePosXi[4] = {0x80, 0x80, 0x80, 0x80};
|
||||
static constexpr uint32_t samplePosYi[4] = {0x80, 0x80, 0x80, 0x80};
|
||||
static constexpr float samplePosX[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||
static constexpr float samplePosY[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MultisampleTraits<SWR_MULTISAMPLE_8X, false>
|
||||
{
|
||||
INLINE static float X(uint32_t sampleNum)
|
||||
{
|
||||
SWR_ASSERT(sampleNum < numSamples);
|
||||
return samplePosX[sampleNum];
|
||||
};
|
||||
INLINE static float Y(uint32_t sampleNum)
|
||||
{
|
||||
SWR_ASSERT(sampleNum < numSamples);
|
||||
return samplePosY[sampleNum];
|
||||
};
|
||||
INLINE static simdscalari FullSampleMask()
|
||||
{
|
||||
static const simdscalari mask = _simd_set1_epi32(0xFF);
|
||||
return mask;
|
||||
}
|
||||
|
||||
static const uint32_t numSamples = 8;
|
||||
static const uint32_t numCoverageSamples = 8;
|
||||
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
|
||||
static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
|
||||
static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
|
||||
static constexpr float samplePosX[8] = {
|
||||
0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f};
|
||||
static constexpr float samplePosY[8] = {
|
||||
0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MultisampleTraits<SWR_MULTISAMPLE_8X, true>
|
||||
{
|
||||
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
|
||||
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
|
||||
INLINE static simdscalari FullSampleMask()
|
||||
{
|
||||
static const simdscalari mask = _simd_set1_epi32(0xFF);
|
||||
return mask;
|
||||
}
|
||||
static const uint32_t numSamples = 8;
|
||||
static const uint32_t numCoverageSamples = 1;
|
||||
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
|
||||
static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
|
||||
static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
|
||||
static constexpr float samplePosX[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
|
||||
static constexpr float samplePosY[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MultisampleTraits<SWR_MULTISAMPLE_16X, false>
|
||||
{
|
||||
INLINE static float X(uint32_t sampleNum)
|
||||
{
|
||||
SWR_ASSERT(sampleNum < numSamples);
|
||||
return samplePosX[sampleNum];
|
||||
};
|
||||
INLINE static float Y(uint32_t sampleNum)
|
||||
{
|
||||
SWR_ASSERT(sampleNum < numSamples);
|
||||
return samplePosY[sampleNum];
|
||||
};
|
||||
INLINE static simdscalari FullSampleMask()
|
||||
{
|
||||
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
|
||||
return mask;
|
||||
}
|
||||
|
||||
static const uint32_t numSamples = 16;
|
||||
static const uint32_t numCoverageSamples = 16;
|
||||
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
|
||||
static constexpr uint32_t samplePosXi[16] = {0x90,
|
||||
0x70,
|
||||
0x50,
|
||||
0xC0,
|
||||
0x30,
|
||||
0xA0,
|
||||
0xD0,
|
||||
0xB0,
|
||||
0x60,
|
||||
0x80,
|
||||
0x40,
|
||||
0x20,
|
||||
0x00,
|
||||
0xF0,
|
||||
0xE0,
|
||||
0x10};
|
||||
static constexpr uint32_t samplePosYi[16] = {0x90,
|
||||
0x50,
|
||||
0xA0,
|
||||
0x70,
|
||||
0x60,
|
||||
0xD0,
|
||||
0xB0,
|
||||
0x30,
|
||||
0xE0,
|
||||
0x10,
|
||||
0x20,
|
||||
0xC0,
|
||||
0x80,
|
||||
0x40,
|
||||
0xF0,
|
||||
0x00};
|
||||
static constexpr float samplePosX[16] = {0.5625f,
|
||||
0.4375f,
|
||||
0.3125f,
|
||||
0.7500f,
|
||||
0.1875f,
|
||||
0.6250f,
|
||||
0.8125f,
|
||||
0.6875f,
|
||||
0.3750f,
|
||||
0.5000f,
|
||||
0.2500f,
|
||||
0.1250f,
|
||||
0.0000f,
|
||||
0.9375f,
|
||||
0.8750f,
|
||||
0.0625f};
|
||||
static constexpr float samplePosY[16] = {0.5625f,
|
||||
0.3125f,
|
||||
0.6250f,
|
||||
0.4375f,
|
||||
0.3750f,
|
||||
0.8125f,
|
||||
0.6875f,
|
||||
0.1875f,
|
||||
0.8750f,
|
||||
0.0625f,
|
||||
0.1250f,
|
||||
0.7500f,
|
||||
0.5000f,
|
||||
0.2500f,
|
||||
0.9375f,
|
||||
0.0000f};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct MultisampleTraits<SWR_MULTISAMPLE_16X, true>
|
||||
{
|
||||
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
|
||||
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
|
||||
INLINE static simdscalari FullSampleMask()
|
||||
{
|
||||
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
|
||||
return mask;
|
||||
}
|
||||
static const uint32_t numSamples = 16;
|
||||
static const uint32_t numCoverageSamples = 1;
|
||||
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
|
||||
static constexpr uint32_t samplePosXi[16] = {0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80};
|
||||
static constexpr uint32_t samplePosYi[16] = {0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80,
|
||||
0x80};
|
||||
static constexpr float samplePosX[16] = {0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f};
|
||||
static constexpr float samplePosY[16] = {0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f,
|
||||
0.5f};
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Reports whether a set of sample positions differs from the
///        standard position tables for the given sample count.
/// @param sampleCount - multisample mode whose samplePosXi/samplePosYi
///        tables are used as the reference pattern
/// @param samplePos - positions under test, read through Xi(i) / Yi(i)
/// @return true as soon as any sample's integer X or Y position differs
///         from the reference table. false when every sample matches in
///         both coordinates, when the mode has at most one sample, or when
///         sampleCount has no reference table.
INLINE
bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount,
                          const SWR_MULTISAMPLE_POS&  samplePos)
{
    // Reference tables; stay null for a sampleCount the switch does not know,
    // so the comparison below can never read through an uninitialized pointer.
    const uint32_t* standardPosX = nullptr;
    const uint32_t* standardPosY = nullptr;
    switch (sampleCount)
    {
    case SWR_MULTISAMPLE_1X:
        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi;
        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi;
        break;
    case SWR_MULTISAMPLE_2X:
        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi;
        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi;
        break;
    case SWR_MULTISAMPLE_4X:
        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi;
        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi;
        break;
    case SWR_MULTISAMPLE_8X:
        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi;
        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi;
        break;
    case SWR_MULTISAMPLE_16X:
        standardPosX = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi;
        standardPosY = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi;
        break;
    default:
        break;
    }

    const uint32_t numSamples = GetNumSamples(sampleCount);

    // Single-sample modes are never scanned; neither is a mode without a table.
    if (numSamples <= 1 || standardPosX == nullptr || standardPosY == nullptr)
    {
        return false;
    }

    // A sample is standard only when BOTH coordinates match the reference.
    // (The previous code accepted a sample when either coordinate matched,
    // so a custom pattern sharing just its X or just its Y values with the
    // standard one was misreported as standard.)
    for (uint32_t i = 0; i < numSamples; i++)
    {
        if (standardPosX[i] != samplePos.Xi(i) || standardPosY[i] != samplePos.Yi(i))
        {
            return true;
        }
    }
    return false;
}
|
||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -1,473 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file rasterizer.cpp
|
||||
*
|
||||
* @brief Implementation for the rasterizer.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include "rasterizer.h"
|
||||
#include "backends/gen_rasterizer.hpp"
|
||||
#include "rdtsc_core.h"
|
||||
#include "backend.h"
|
||||
#include "utils.h"
|
||||
#include "frontend.h"
|
||||
#include "tilemgr.h"
|
||||
#include "memory/tilingtraits.h"
|
||||
#include "rasterizer_impl.h"
|
||||
|
||||
// Table of triangle rasterizer entry points. GetRasterizerFunc() reads it as
// [numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges].
PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT]
                              [2]
                              [2]
                              [SWR_INPUT_COVERAGE_COUNT]
                              [STATE_VALID_TRI_EDGE_COUNT]
                              [2];
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes a binned line by bloating it into two triangles and
///        calling the triangle rasterizer for each triangle whose bounding
///        box overlaps both the macrotile and the scissor rectangle.
/// @param pDC - draw context
/// @param workerId - passed through to the triangle rasterizer
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
/// @param pData - pointer to the TRIANGLE_WORK_DESC describing the line
void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif

    // bloat line to two tris and call the triangle rasterizer twice
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId);

    const API_STATE&     state     = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;

    // macrotile dimensioning
    uint32_t macroX, macroY;
    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
    int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
    int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
    int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

    const SWR_RECT& scissorInFixedPoint =
        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

    // True when a fixed-point bounding box touches both this macrotile and the
    // scissor rect. Shared by the two triangles so the test is written once.
    const auto overlapsTileAndScissor = [&](const SWR_RECT& bbox) -> bool {
        return !(bbox.xmin > macroBoxRight || bbox.xmin > scissorInFixedPoint.xmax ||
                 bbox.xmax - 1 < macroBoxLeft || bbox.xmax - 1 < scissorInFixedPoint.xmin ||
                 bbox.ymin > macroBoxBottom || bbox.ymin > scissorInFixedPoint.ymax ||
                 bbox.ymax - 1 < macroBoxTop || bbox.ymax - 1 < scissorInFixedPoint.ymin);
    };

    // create a copy of the triangle buffer to write our adjusted vertices to
    OSALIGNSIMD(float) newTriBuffer[4 * 4];
    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
    newWorkDesc.pTriBuffer         = &newTriBuffer[0];

    // create a copy of the attrib buffer to write our adjusted attribs to
    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
    newWorkDesc.pAttribs = &newAttribBuffer[0];

    // per-vertex offsets (in units of line width) applied to the minor axis
    const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
    const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);

    __m128 vX, vY, vZ, vRecipW;

    vX      = _mm_load_ps(workDesc.pTriBuffer);
    vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
    vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);

    // triangle 0
    // v0,v1 -> v0,v0,v1
    __m128 vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
    __m128 vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
    __m128 vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
    __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));

    __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
    __m128 vAdjust    = _mm_mul_ps(vLineWidth, vBloat0);
    // widen along the axis perpendicular to the line's major axis
    if (workDesc.triFlags.yMajor)
    {
        vXa = _mm_add_ps(vAdjust, vXa);
    }
    else
    {
        vYa = _mm_add_ps(vAdjust, vYa);
    }

    // Store triangle description for rasterizer
    _mm_store_ps((float*)&newTriBuffer[0], vXa);
    _mm_store_ps((float*)&newTriBuffer[4], vYa);
    _mm_store_ps((float*)&newTriBuffer[8], vZa);
    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);

    // binner bins 3 edges for lines as v0, v1, v1
    // tri0 needs v0, v0, v1
    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
    {
        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);

        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
    }

    // Store user clip distances for triangle 0
    float    newClipBuffer[3 * 8];
    uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
    if (numClipDist)
    {
        newWorkDesc.pUserClipBuffer = newClipBuffer;

        float* pOldBuffer = workDesc.pUserClipBuffer;
        float* pNewBuffer = newClipBuffer;
        for (uint32_t i = 0; i < numClipDist; ++i)
        {
            // read barycentric coeffs from binner
            float a = *(pOldBuffer++);
            float b = *(pOldBuffer++);

            // reconstruct original clip distance at vertices
            float c0 = a + b;
            float c1 = b;

            // construct triangle barycentrics
            *(pNewBuffer++) = c0 - c1;
            *(pNewBuffer++) = c0 - c1;
            *(pNewBuffer++) = c1;
        }
    }

    // setup triangle rasterizer function
    PFN_WORK_FUNC pfnTriRast;
    // conservative rast not supported for points/lines
    pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
                                   rastState.bIsCenterPattern,
                                   false,
                                   SWR_INPUT_COVERAGE_NONE,
                                   EdgeValToEdgeState(ALL_EDGES_VALID),
                                   (pDC->pState->state.scissorsTileAligned == false));

    // make sure this macrotile intersects the triangle
    __m128i vXai = fpToFixedPoint(vXa);
    __m128i vYai = fpToFixedPoint(vYa);
    OSALIGNSIMD(SWR_RECT) bboxA;
    calcBoundingBoxInt(vXai, vYai, bboxA);

    if (overlapsTileAndScissor(bboxA))
    {
        // rasterize triangle
        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
    }

    // triangle 1
    // v0,v1 -> v1,v1,v0
    vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
    vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
    vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
    vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));

    vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
    if (workDesc.triFlags.yMajor)
    {
        vXa = _mm_add_ps(vAdjust, vXa);
    }
    else
    {
        vYa = _mm_add_ps(vAdjust, vYa);
    }

    // Store triangle description for rasterizer
    _mm_store_ps((float*)&newTriBuffer[0], vXa);
    _mm_store_ps((float*)&newTriBuffer[4], vYa);
    _mm_store_ps((float*)&newTriBuffer[8], vZa);
    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);

    // binner bins 3 edges for lines as v0, v1, v1
    // tri1 needs v1, v1, v0
    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
    {
        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);

        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
    }

    // store user clip distance for triangle 1
    if (numClipDist)
    {
        float* pOldBuffer = workDesc.pUserClipBuffer;
        float* pNewBuffer = newClipBuffer;
        for (uint32_t i = 0; i < numClipDist; ++i)
        {
            // read barycentric coeffs from binner
            float a = *(pOldBuffer++);
            float b = *(pOldBuffer++);

            // reconstruct original clip distance at vertices
            float c0 = a + b;
            float c1 = b;

            // construct triangle barycentrics
            *(pNewBuffer++) = c1 - c0;
            *(pNewBuffer++) = c1 - c0;
            *(pNewBuffer++) = c0;
        }
    }

    vXai = fpToFixedPoint(vXa);
    vYai = fpToFixedPoint(vYa);
    calcBoundingBoxInt(vXai, vYai, bboxA);

    if (overlapsTileAndScissor(bboxA))
    {
        // rasterize triangle
        pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
    }

    // Close the profiling scope opened at the top of the function. This was
    // previously a second RDTSC_BEGIN, which left the BERasterizeLine bucket
    // unbalanced.
    RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeLine, 1);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes a point that covers a single pixel: builds a triangle
///        descriptor with a one-bit coverage mask and constant z, then
///        hands it directly to the pixel backend.
/// @param pDC - draw context
/// @param workerId - passed through to GetRenderHotTiles and the backend
/// @param macroTile - macrotile id, passed through to GetRenderHotTiles
/// @param pData - pointer to the TRIANGLE_WORK_DESC describing the point
void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif

    const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
    const BACKEND_FUNCS&      backendFuncs = pDC->pState->backendFuncs;

    // [y][x] offset inside the raster tile -> bit index in the coverage mask
    static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
                                               {2, 3, 6, 7, 10, 11, 14, 15},
                                               {16, 17, 20, 21, 24, 25, 28, 29},
                                               {18, 19, 22, 23, 26, 27, 30, 31},
                                               {32, 33, 36, 37, 40, 41, 44, 45},
                                               {34, 35, 38, 39, 42, 43, 46, 47},
                                               {48, 49, 52, 53, 56, 57, 60, 61},
                                               {50, 51, 54, 55, 58, 59, 62, 63}};

    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};

    // The first two slots of the triangle buffer hold the tile-aligned x/y as
    // raw 32-bit integers; the third slot holds z as a float.
    // @todo use structs for readability
    const uint32_t* pPackedCoords = (uint32_t*)workDesc.pTriBuffer;
    uint32_t        tileAlignedX  = pPackedCoords[0];
    uint32_t        tileAlignedY  = pPackedCoords[1];
    float           z             = workDesc.pTriBuffer[2];

    // @todo implement an optimized backend that doesn't require triangle information

    // x lives in bits 0..2 and y in bits 4..6 of the coverageMask flag; the
    // 0x7 masks keep both within the bounds of coverageMap.
    const uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
    const uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;

    const unsigned long long pointBit = 1ULL << coverageMap[tY][tX];
    for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
    {
        triDesc.coverageMask[i] = pointBit;
    }
    triDesc.anyCoveredSamples = triDesc.coverageMask[0];
    triDesc.innerCoverageMask = triDesc.coverageMask[0];

    // no persp divide needed for points, so both attribute pointers alias
    triDesc.pPerspAttribs = workDesc.pAttribs;
    triDesc.pAttribs      = triDesc.pPerspAttribs;
    triDesc.triFlags      = workDesc.triFlags;
    triDesc.recipDet      = 1.0f;

    // constant interpolation: zero i/j, unit 1/w, same z at all three vertices
    for (uint32_t v = 0; v < 3; ++v)
    {
        triDesc.OneOverW[v] = 1.0f;
        triDesc.I[v]        = 0.0f;
        triDesc.J[v]        = 0.0f;
        triDesc.Z[v]        = z;
    }

    RenderOutputBuffers renderBuffers;
    GetRenderHotTiles(pDC,
                      workerId,
                      macroTile,
                      tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
                      tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
                      renderBuffers,
                      triDesc.triFlags.renderTargetArrayIndex);

    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
    RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes a point by expanding it into a square of side
///        triFlags.pointSize built from two triangles, each submitted to
///        the triangle rasterizer. When pointSpriteTexCoordMask is non-zero
///        the selected attribute slots are overwritten with per-vertex
///        sprite texcoords for each triangle.
/// @param pDC - draw context
/// @param workerId - passed through to the triangle rasterizer
/// @param macroTile - macrotile id, passed through to the triangle rasterizer
/// @param pData - pointer to the TRIANGLE_WORK_DESC describing the point
void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
    const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
    const SWR_RASTSTATE&      rastState    = pDC->pState->state.rastState;
    const SWR_BACKEND_STATE&  backendState = pDC->pState->state.backendState;

    const bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;

    // load point vertex
    const float x = workDesc.pTriBuffer[0];
    const float y = workDesc.pTriBuffer[1];
    const float z = workDesc.pTriBuffer[2];

    // scratch copies of the triangle and attribute buffers for the adjusted data
    OSALIGNSIMD(float) newTriBuffer[4 * 4];
    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];

    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
    newWorkDesc.pTriBuffer         = &newTriBuffer[0];
    newWorkDesc.pAttribs           = &newAttribBuffer[0];
    newWorkDesc.pUserClipBuffer    = workDesc.pUserClipBuffer;
    newWorkDesc.numAttribs         = workDesc.numAttribs;
    newWorkDesc.triFlags           = workDesc.triFlags;

    // Writes the three per-vertex texcoord vectors into every attribute slot
    // whose bit is set in pointSpriteTexCoordMask.
    const auto overwriteSpriteTexCoords =
        [&](const __m128& vtx0, const __m128& vtx1, const __m128& vtx2) {
            uint32_t      texCoordMask   = backendState.pointSpriteTexCoordMask;
            unsigned long texCoordAttrib = 0;

            while (_BitScanForward(&texCoordAttrib, texCoordMask))
            {
                texCoordMask &= ~(1 << texCoordAttrib);
                __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
                pTexAttrib[0]      = vtx0;
                pTexAttrib[1]      = vtx1;
                pTexAttrib[2]      = vtx2;
            }
        };

    // construct two tris by bloating point by point size
    const float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
    const float lowerX        = x - halfPointSize;
    const float upperX        = x + halfPointSize;
    const float lowerY        = y - halfPointSize;
    const float upperY        = y + halfPointSize;

    // tri 0: x in [0..2], y in [4..6]; slots 3 and 7 are left untouched
    newTriBuffer[0] = lowerX;
    newTriBuffer[1] = lowerX;
    newTriBuffer[2] = upperX;
    newTriBuffer[4] = lowerY;
    newTriBuffer[5] = upperY;
    newTriBuffer[6] = upperY;
    // z in [8..11], 1.0 in [12..15]; shared by both triangles
    _mm_store_ps(&newTriBuffer[8], _mm_set1_ps(z));
    _mm_store_ps(&newTriBuffer[12], _mm_set1_ps(1.0f));

    // setup triangle rasterizer function
    // conservative rast not supported for points/lines
    PFN_WORK_FUNC pfnTriRast =
        GetRasterizerFunc(rastState.sampleCount,
                          rastState.bIsCenterPattern,
                          false,
                          SWR_INPUT_COVERAGE_NONE,
                          EdgeValToEdgeState(ALL_EDGES_VALID),
                          (pDC->pState->state.scissorsTileAligned == false));

    if (isPointSpriteTexCoordEnabled)
    {
        // start from the original attribs, then overwrite the sprite texcoord slots
        memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
        newWorkDesc.pAttribs = &newAttribBuffer[0];

        if (rastState.pointSpriteTopOrigin)
        {
            overwriteSpriteTexCoords(
                _mm_set_ps(1, 0, 0, 0), _mm_set_ps(1, 0, 1, 0), _mm_set_ps(1, 0, 1, 1));
        }
        else
        {
            overwriteSpriteTexCoords(
                _mm_set_ps(1, 0, 1, 0), _mm_set_ps(1, 0, 0, 0), _mm_set_ps(1, 0, 0, 1));
        }
    }
    else
    {
        // no texcoord overwrite, can reuse the attrib buffer from frontend
        newWorkDesc.pAttribs = workDesc.pAttribs;
    }

    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);

    // tri 1: only x and y change; z, w unchanged
    newTriBuffer[0] = lowerX;
    newTriBuffer[1] = upperX;
    newTriBuffer[2] = upperX;
    newTriBuffer[4] = lowerY;
    newTriBuffer[5] = upperY;
    newTriBuffer[6] = lowerY;

    if (isPointSpriteTexCoordEnabled)
    {
        if (rastState.pointSpriteTopOrigin)
        {
            overwriteSpriteTexCoords(
                _mm_set_ps(1, 0, 0, 0), _mm_set_ps(1, 0, 1, 1), _mm_set_ps(1, 0, 0, 1));
        }
        else
        {
            overwriteSpriteTexCoords(
                _mm_set_ps(1, 0, 1, 0), _mm_set_ps(1, 0, 0, 1), _mm_set_ps(1, 0, 1, 1));
        }
    }

    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
|
||||
|
||||
void InitRasterizerFunctions()
|
||||
{
|
||||
InitRasterizerFuncs();
|
||||
}
|
||||
|
||||
// Selector for correct templated RasterizeTriangle function
|
||||
PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
|
||||
bool IsCenter,
|
||||
bool IsConservative,
|
||||
SWR_INPUT_COVERAGE InputCoverage,
|
||||
uint32_t EdgeEnable,
|
||||
bool RasterizeScissorEdges)
|
||||
{
|
||||
SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
|
||||
SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
|
||||
SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
|
||||
|
||||
PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
|
||||
[EdgeEnable][RasterizeScissorEdges];
|
||||
SWR_ASSERT(func);
|
||||
|
||||
return func;
|
||||
}
|
||||
|
|
@ -1,237 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file rasterizer.h
|
||||
*
|
||||
* @brief Definitions for the rasterizer.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "context.h"
|
||||
#include <type_traits>
|
||||
#include "conservativeRast.h"
|
||||
#include "multisample.h"
|
||||
|
||||
void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
|
||||
void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
|
||||
void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
|
||||
void InitRasterizerFunctions();
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Converts four floats to fixed point: each lane is multiplied by
///        FIXED_POINT_SCALE and then converted to a 32-bit integer with
///        _mm_cvtps_epi32 (which rounds per the current SSE rounding mode).
/// @param vIn - four packed floats
/// @return four packed 32-bit integers
INLINE
__m128i fpToFixedPoint(const __m128 vIn)
{
    const __m128 vScale = _mm_set1_ps(FIXED_POINT_SCALE);
    return _mm_cvtps_epi32(_mm_mul_ps(vIn, vScale));
}
|
||||
|
||||
// Dense index for each supported combination of valid triangle edges.
// STATE_VALID_TRI_EDGE_COUNT is the number of states and is used to size
// lookup tables.
enum TriEdgesStates
{
    STATE_NO_VALID_EDGES       = 0,
    STATE_E0_E1_VALID          = 1,
    STATE_E0_E2_VALID          = 2,
    STATE_E1_E2_VALID          = 3,
    STATE_ALL_EDGES_VALID      = 4,
    STATE_VALID_TRI_EDGE_COUNT = 5,
};
|
||||
|
||||
// Bitmask form of the valid-edge combinations: bit 0 = edge 0, bit 1 = edge 1,
// bit 2 = edge 2. VALID_TRI_EDGE_COUNT is one past the largest mask and sizes
// tables indexed by mask value.
enum TriEdgesValues
{
    NO_VALID_EDGES       = 0x0,
    E0_E1_VALID          = 0x3,
    E0_E2_VALID          = 0x5,
    E1_E2_VALID          = 0x6,
    ALL_EDGES_VALID      = 0x7,
    VALID_TRI_EDGE_COUNT = 0x8,
};
|
||||
|
||||
// Selector for correct templated RasterizeTriangle function
|
||||
PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
|
||||
bool IsCenter,
|
||||
bool IsConservative,
|
||||
SWR_INPUT_COVERAGE InputCoverage,
|
||||
uint32_t EdgeEnable,
|
||||
bool RasterizeScissorEdges);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief ValidTriEdges convenience typedefs used for templated function
|
||||
/// specialization supported Fixed Point precisions
|
||||
typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT;
|
||||
typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT;
|
||||
typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT;
|
||||
typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT;
|
||||
typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT;
|
||||
|
||||
typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT;
|
||||
typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT;
|
||||
typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT;
|
||||
typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT;
|
||||
typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT;
|
||||
|
||||
// some specializations to convert from edge state to edge bitmask values
|
||||
template <typename EdgeMask>
|
||||
struct EdgeMaskVal
|
||||
{
|
||||
static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID,
|
||||
"Primary EdgeMaskVal shouldn't be instantiated");
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EdgeMaskVal<StateAllEdgesValidT>
|
||||
{
|
||||
typedef AllEdgesValidT T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EdgeMaskVal<StateE0E1ValidT>
|
||||
{
|
||||
typedef E0E1ValidT T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EdgeMaskVal<StateE0E2ValidT>
|
||||
{
|
||||
typedef E0E2ValidT T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EdgeMaskVal<StateE1E2ValidT>
|
||||
{
|
||||
typedef E1E2ValidT T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EdgeMaskVal<StateNoEdgesValidT>
|
||||
{
|
||||
typedef NoEdgesValidT T;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
/// @brief Converts a triangle edge bitmask value into an edge state index.
///
/// Masks 0x3, 0x5, 0x6 and 0x7 map to states 1, 2, 3 and 4 respectively;
/// every other mask below VALID_TRI_EDGE_COUNT maps to state 0.
/// @param val - edge bitmask; asserted to be below VALID_TRI_EDGE_COUNT
/// @return edge state index in the range [0, 4]
INLINE uint32_t EdgeValToEdgeState(uint32_t val)
{
    SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask");

    switch (val)
    {
    case 0x3:
        return 1;
    case 0x5:
        return 2;
    case 0x6:
        return 3;
    case 0x7:
        return 4;
    default:
        // 0x0, 0x1, 0x2 and 0x4 all collapse to state 0
        return 0;
    }
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @struct RasterScissorEdgesT
|
||||
/// @brief Primary RasterScissorEdgesT templated struct that holds compile
|
||||
/// time information about the number of edges needed to be rasterized,
|
||||
/// If either the scissor rect or conservative rast is enabled,
|
||||
/// the scissor test is enabled and the rasterizer will test
|
||||
/// 3 triangle edges + 4 scissor edges for coverage.
|
||||
/// @tparam RasterScissorEdgesT: number of multisamples
|
||||
/// @tparam ConservativeT: is this a conservative rasterization
|
||||
/// @tparam EdgeMaskT: Which edges are valid(not degenerate)
|
||||
template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT>
|
||||
struct RasterEdgeTraits
|
||||
{
|
||||
typedef std::true_type RasterizeScissorEdgesT;
|
||||
typedef std::integral_constant<uint32_t, 7> NumEdgesT;
|
||||
// typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT;
|
||||
typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief specialization of RasterEdgeTraits. If neither scissor rect
|
||||
/// nor conservative rast is enabled, only test 3 triangle edges
|
||||
/// for coverage
|
||||
template <typename EdgeMaskT>
|
||||
struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT>
|
||||
{
|
||||
typedef std::false_type RasterizeScissorEdgesT;
|
||||
typedef std::integral_constant<uint32_t, 3> NumEdgesT;
|
||||
// no need for degenerate edge masking in non-conservative case; rasterize all triangle edges
|
||||
typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @struct RasterizerTraits
|
||||
/// @brief templated struct that holds compile time information used
|
||||
/// during rasterization. Inherits EdgeTraits and ConservativeRastBETraits.
|
||||
/// @tparam NumSamplesT: number of multisamples
|
||||
/// @tparam ConservativeT: is this a conservative rasterization
|
||||
/// @tparam InputCoverageT: what type of input coverage is the PS expecting?
|
||||
/// (only used with conservative rasterization)
|
||||
/// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
|
||||
template <typename NumSamplesT,
|
||||
typename CenterPatternT,
|
||||
typename ConservativeT,
|
||||
typename InputCoverageT,
|
||||
typename EdgeEnableT,
|
||||
typename RasterScissorEdgesT>
|
||||
struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
|
||||
public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
|
||||
{
|
||||
typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value),
|
||||
CenterPatternT::value>
|
||||
MT;
|
||||
|
||||
/// Fixed point precision the rasterizer is using
|
||||
typedef FixedPointTraits<Fixed_16_8> PrecisionT;
|
||||
/// Fixed point precision of the edge tests used during rasterization
|
||||
typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT;
|
||||
|
||||
// If conservative rast or MSAA center pattern is enabled, only need a single sample coverage
|
||||
// test, with the result copied to all samples
|
||||
typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples>
|
||||
NumCoverageSamplesT;
|
||||
|
||||
static_assert(
|
||||
EdgePrecisionT::BitsT::value >=
|
||||
ConservativeRastBETraits<ConservativeT,
|
||||
InputCoverageT>::ConservativePrecisionT::BitsT::value,
|
||||
"Rasterizer edge fixed point precision < required conservative rast precision");
|
||||
|
||||
/// constants used to offset between different types of raster tiles
|
||||
static const int colorRasterTileStep{
|
||||
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) *
|
||||
MT::numSamples};
|
||||
static const int depthRasterTileStep{
|
||||
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) *
|
||||
MT::numSamples};
|
||||
static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
|
||||
(FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) *
|
||||
MT::numSamples};
|
||||
static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
|
||||
colorRasterTileStep};
|
||||
static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
|
||||
depthRasterTileStep};
|
||||
static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
|
||||
stencilRasterTileStep};
|
||||
};
|
||||
|
||||
template <uint32_t NumSamplesT,
|
||||
uint32_t CenterPatternT,
|
||||
uint32_t ConservativeT,
|
||||
uint32_t InputCoverageT,
|
||||
uint32_t EdgeEnableT,
|
||||
uint32_t RasterScissorEdgesT>
|
||||
struct RasterizerTraits final
|
||||
: public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>,
|
||||
std::integral_constant<bool, CenterPatternT != 0>,
|
||||
std::integral_constant<bool, ConservativeT != 0>,
|
||||
std::integral_constant<uint32_t, InputCoverageT>,
|
||||
std::integral_constant<uint32_t, EdgeEnableT>,
|
||||
std::integral_constant<bool, RasterScissorEdgesT != 0>>
|
||||
{
|
||||
};
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,94 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#include "rdtsc_core.h"
|
||||
#include "common/rdtsc_buckets.h"
|
||||
|
||||
// must match CORE_BUCKETS enum order
|
||||
BUCKET_DESC gCoreBuckets[] = {
|
||||
{"APIClearRenderTarget", "", true, 0xff0b8bea},
|
||||
{"APIDraw", "", true, 0xff000066},
|
||||
{"APIDrawWakeAllThreads", "", false, 0xffffffff},
|
||||
{"APIDrawIndexed", "", true, 0xff000066},
|
||||
{"APIDispatch", "", true, 0xff660000},
|
||||
{"APIStoreTiles", "", true, 0xff00ffff},
|
||||
{"APIGetDrawContext", "", false, 0xffffffff},
|
||||
{"APISync", "", true, 0xff6666ff},
|
||||
{"APIWaitForIdle", "", true, 0xff0000ff},
|
||||
{"FEProcessDraw", "", true, 0xff009900},
|
||||
{"FEProcessDrawIndexed", "", true, 0xff009900},
|
||||
{"FEFetchShader", "", false, 0xffffffff},
|
||||
{"FEVertexShader", "", false, 0xffffffff},
|
||||
{"FEHullShader", "", false, 0xffffffff},
|
||||
{"FETessellation", "", false, 0xffffffff},
|
||||
{"FEDomainShader", "", false, 0xffffffff},
|
||||
{"FEGeometryShader", "", false, 0xffffffff},
|
||||
{"FEStreamout", "", false, 0xffffffff},
|
||||
{"FEPAAssemble", "", false, 0xffffffff},
|
||||
{"FEBinPoints", "", false, 0xff29b854},
|
||||
{"FEBinLines", "", false, 0xff29b854},
|
||||
{"FEBinTriangles", "", false, 0xff29b854},
|
||||
{"FETriangleSetup", "", false, 0xffffffff},
|
||||
{"FEViewportCull", "", false, 0xffffffff},
|
||||
{"FEGuardbandClip", "", false, 0xffffffff},
|
||||
{"FEClipPoints", "", false, 0xffffffff},
|
||||
{"FEClipLines", "", false, 0xffffffff},
|
||||
{"FEClipTriangles", "", false, 0xffffffff},
|
||||
{"FEClipRectangles", "", false, 0xffffffff},
|
||||
{"FECullZeroAreaAndBackface", "", false, 0xffffffff},
|
||||
{"FECullBetweenCenters", "", false, 0xffffffff},
|
||||
{"FEEarlyRastEnter", "", false, 0xffffffff},
|
||||
{"FEEarlyRastExit", "", false, 0xffffffff},
|
||||
{"FEProcessStoreTiles", "", true, 0xff39c864},
|
||||
{"FEProcessInvalidateTiles", "", true, 0xffffffff},
|
||||
{"WorkerWorkOnFifoBE", "", false, 0xff40261c},
|
||||
{"WorkerFoundWork", "", false, 0xff573326},
|
||||
{"BELoadTiles", "", true, 0xffb0e2ff},
|
||||
{"BEDispatch", "", true, 0xff00a2ff},
|
||||
{"BEClear", "", true, 0xff00ccbb},
|
||||
{"BERasterizeLine", "", true, 0xffb26a4e},
|
||||
{"BERasterizeTriangle", "", true, 0xffb26a4e},
|
||||
{"BETriangleSetup", "", false, 0xffffffff},
|
||||
{"BEStepSetup", "", false, 0xffffffff},
|
||||
{"BECullZeroArea", "", false, 0xffffffff},
|
||||
{"BEEmptyTriangle", "", false, 0xffffffff},
|
||||
{"BETrivialAccept", "", false, 0xffffffff},
|
||||
{"BETrivialReject", "", false, 0xffffffff},
|
||||
{"BERasterizePartial", "", false, 0xffffffff},
|
||||
{"BEPixelBackend", "", false, 0xffffffff},
|
||||
{"BESetup", "", false, 0xffffffff},
|
||||
{"BEBarycentric", "", false, 0xffffffff},
|
||||
{"BEEarlyDepthTest", "", false, 0xffffffff},
|
||||
{"BEPixelShader", "", false, 0xffffffff},
|
||||
{"BESingleSampleBackend", "", false, 0xffffffff},
|
||||
{"BEPixelRateBackend", "", false, 0xffffffff},
|
||||
{"BESampleRateBackend", "", false, 0xffffffff},
|
||||
{"BENullBackend", "", false, 0xffffffff},
|
||||
{"BELateDepthTest", "", false, 0xffffffff},
|
||||
{"BEOutputMerger", "", false, 0xffffffff},
|
||||
{"BEStoreTiles", "", true, 0xff00cccc},
|
||||
{"BEEndTile", "", false, 0xffffffff},
|
||||
};
|
||||
static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])),
|
||||
"RDTSC Bucket enum and description table size mismatched.");
|
||||
|
||||
|
|
@ -1,185 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
|
||||
#pragma once
|
||||
#include "knobs.h"
|
||||
|
||||
#include "common/os.h"
|
||||
#include "common/rdtsc_buckets.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
// NOTE: This enum MUST be kept in sync with gCoreBuckets in rdtsc_core.cpp.
//       Each enumerator is used as an index into that table, so both the
//       order and the number of entries have to match.
///////////////////////////////////////////////////////////////////////////////
enum CORE_BUCKETS
{
    // API
    APIClearRenderTarget,
    APIDraw,
    APIDrawWakeAllThreads,
    APIDrawIndexed,
    APIDispatch,
    APIStoreTiles,
    APIGetDrawContext,
    APISync,
    APIWaitForIdle,

    // FE
    FEProcessDraw,
    FEProcessDrawIndexed,
    FEFetchShader,
    FEVertexShader,
    FEHullShader,
    FETessellation,
    FEDomainShader,
    FEGeometryShader,
    FEStreamout,
    FEPAAssemble,
    FEBinPoints,
    FEBinLines,
    FEBinTriangles,
    FETriangleSetup,
    FEViewportCull,
    FEGuardbandClip,
    FEClipPoints,
    FEClipLines,
    FEClipTriangles,
    FEClipRectangles,
    FECullZeroAreaAndBackface,
    FECullBetweenCenters,
    FEEarlyRastEnter,
    FEEarlyRastExit,
    FEProcessStoreTiles,
    FEProcessInvalidateTiles,

    // Worker
    WorkerWorkOnFifoBE,
    WorkerFoundWork,

    // BE
    BELoadTiles,
    BEDispatch,
    BEClear,
    BERasterizeLine,
    BERasterizeTriangle,
    BETriangleSetup,
    BEStepSetup,
    BECullZeroArea,
    BEEmptyTriangle,
    BETrivialAccept,
    BETrivialReject,
    BERasterizePartial,
    BEPixelBackend,
    BESetup,
    BEBarycentric,
    BEEarlyDepthTest,
    BEPixelShader,
    BESingleSampleBackend,
    BEPixelRateBackend,
    BESampleRateBackend,
    BENullBackend,
    BELateDepthTest,
    BEOutputMerger,
    BEStoreTiles,
    BEEndTile,

    // Must stay last: total number of buckets declared above.
    NumBuckets
};
|
||||
|
||||
void rdtscReset(BucketManager* pBucketMgr);
|
||||
void rdtscInit(BucketManager* pBucketMgr, int threadId);
|
||||
void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId);
|
||||
void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId);
|
||||
void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2);
|
||||
void rdtscEndFrame(BucketManager* pBucketMgr);
|
||||
|
||||
// Profiling hooks. When KNOB_ENABLE_RDTSC is defined every hook forwards to
// the rdtsc* function of the same name. Otherwise each hook expands to
// nothing, so its argument expressions are never evaluated.
#if defined(KNOB_ENABLE_RDTSC)

#define RDTSC_RESET(pBucketMgr) rdtscReset(pBucketMgr)
#define RDTSC_INIT(pBucketMgr, threadId) rdtscInit(pBucketMgr, threadId)
#define RDTSC_START(pBucketMgr, bucket) rdtscStart(pBucketMgr, bucket)
#define RDTSC_STOP(pBucketMgr, bucket, count, draw) rdtscStop(pBucketMgr, bucket, count, draw)
#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2) \
    rdtscEvent(pBucketMgr, bucket, count1, count2)
#define RDTSC_ENDFRAME(pBucketMgr) rdtscEndFrame(pBucketMgr)

#else // profiling compiled out

#define RDTSC_RESET(pBucketMgr)
#define RDTSC_INIT(pBucketMgr, threadId)
#define RDTSC_START(pBucketMgr, bucket)
#define RDTSC_STOP(pBucketMgr, bucket, count, draw)
#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2)
#define RDTSC_ENDFRAME(pBucketMgr)

#endif
|
||||
|
||||
extern BUCKET_DESC gCoreBuckets[];
|
||||
|
||||
/// @brief Puts the bucket manager back at frame 0 and clears its threads.
/// @param pBucketMgr - bucket manager to reset; must not be null
INLINE void rdtscReset(BucketManager* pBucketMgr)
{
    BucketManager& mgr = *pBucketMgr;

    mgr.mCurrentFrame = 0;
    mgr.ClearThreads();
}
|
||||
|
||||
/// @brief Per-thread profiling setup.
///
/// The first time this runs for thread 0 it registers every entry of
/// gCoreBuckets with the bucket manager and records the returned ids in
/// mBucketMap, indexed by CORE_BUCKETS value. Every call then registers the
/// calling thread under the name "API" (thread 0) or "WORKER" (any other).
/// @param pBucketMgr - bucket manager; must not be null
/// @param threadId - 0 selects the API thread, anything else a worker
INLINE void rdtscInit(BucketManager* pBucketMgr, int threadId)
{
    // register all the buckets once
    const bool needsBucketRegistration =
        !pBucketMgr->mBucketsInitialized && (threadId == 0);

    if (needsBucketRegistration)
    {
        pBucketMgr->mBucketMap.resize(NumBuckets);

        for (uint32_t bucket = 0; bucket < NumBuckets; ++bucket)
        {
            pBucketMgr->mBucketMap[bucket] = pBucketMgr->RegisterBucket(gCoreBuckets[bucket]);
        }

        pBucketMgr->mBucketsInitialized = true;
    }

    std::string threadName = (threadId == 0) ? "API" : "WORKER";
    pBucketMgr->RegisterThread(threadName);
}
|
||||
|
||||
/// @brief Starts the bucket that was registered for the given core bucket.
/// @param pBucketMgr - bucket manager; must not be null
/// @param bucketId - CORE_BUCKETS value, used as an index into mBucketMap
INLINE void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId)
{
    // translate the core bucket index into the id handed out at registration
    const uint32_t registeredId = pBucketMgr->mBucketMap[bucketId];

    pBucketMgr->StartBucket(registeredId);
}
|
||||
|
||||
/// @brief Stops the bucket that was registered for the given core bucket.
/// @param pBucketMgr - bucket manager; must not be null
/// @param bucketId - CORE_BUCKETS value, used as an index into mBucketMap
/// @param count - accepted for interface compatibility; not used here
/// @param drawId - accepted for interface compatibility; not used here
INLINE void rdtscStop(BucketManager* pBucketMgr,
                      uint32_t       bucketId,
                      uint32_t       count,
                      uint64_t       drawId)
{
    // translate the core bucket index into the id handed out at registration
    const uint32_t registeredId = pBucketMgr->mBucketMap[bucketId];

    pBucketMgr->StopBucket(registeredId);
}
|
||||
|
||||
/// @brief Records an event on the bucket registered for the core bucket.
/// @param pBucketMgr - bucket manager; must not be null
/// @param bucketId - CORE_BUCKETS value, used as an index into mBucketMap
/// @param count1 - value forwarded to AddEvent
/// @param count2 - accepted for interface compatibility; not used here
INLINE void rdtscEvent(BucketManager* pBucketMgr,
                       uint32_t       bucketId,
                       uint32_t       count1,
                       uint32_t       count2)
{
    // translate the core bucket index into the id handed out at registration
    const uint32_t registeredId = pBucketMgr->mBucketMap[bucketId];

    pBucketMgr->AddEvent(registeredId, count1);
}
|
||||
|
||||
/// @brief Marks the end of a frame for bucket capture.
///
/// Increments the frame counter. If the knobs describe a non-empty window
/// (KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME), capture starts when
/// the counter reaches the start frame; when it reaches the end frame
/// capture stops and the report is written to "rdtsc.txt".
/// @param pBucketMgr - bucket manager; must not be null
INLINE void rdtscEndFrame(BucketManager* pBucketMgr)
{
    pBucketMgr->mCurrentFrame++;

    const bool reachedStartFrame = (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_START_FRAME) &&
                                   (KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME);
    if (reachedStartFrame)
    {
        pBucketMgr->StartCapture();
    }

    const bool reachedEndFrame = (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_END_FRAME) &&
                                 (KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME);
    if (reachedEndFrame)
    {
        pBucketMgr->StopCapture();
        pBucketMgr->PrintReport("rdtsc.txt");
    }
}
|
||||
|
|
@ -1,95 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file ringbuffer.h
|
||||
*
|
||||
* @brief RingBuffer
|
||||
* The RingBuffer class manages all aspects of the ring buffer including
|
||||
* the head/tail indices, etc.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
template <typename T>
|
||||
class RingBuffer
|
||||
{
|
||||
public:
|
||||
RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {}
|
||||
|
||||
~RingBuffer() { Destroy(); }
|
||||
|
||||
void Init(uint32_t numEntries)
|
||||
{
|
||||
SWR_ASSERT(numEntries > 0);
|
||||
SWR_ASSERT(((1ULL << 32) % numEntries) == 0,
|
||||
"%d is not evenly divisible into 2 ^ 32. Wrap errors will occur!",
|
||||
numEntries);
|
||||
mNumEntries = numEntries;
|
||||
mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64);
|
||||
SWR_ASSERT(mpRingBuffer != nullptr);
|
||||
memset((void*)mpRingBuffer, 0, sizeof(T) * numEntries);
|
||||
}
|
||||
|
||||
void Destroy()
|
||||
{
|
||||
AlignedFree(mpRingBuffer);
|
||||
mpRingBuffer = nullptr;
|
||||
}
|
||||
|
||||
T& operator[](const uint32_t index)
|
||||
{
|
||||
SWR_ASSERT(index < mNumEntries);
|
||||
return mpRingBuffer[index];
|
||||
}
|
||||
|
||||
INLINE void Enqueue()
|
||||
{
|
||||
mRingHead++; // There's only one producer.
|
||||
// Assert to find wrap-around cases, NEVER ENABLE DURING CHECKIN!!
|
||||
// SWR_REL_ASSERT(mRingHead);
|
||||
}
|
||||
|
||||
INLINE void Dequeue()
|
||||
{
|
||||
InterlockedIncrement(&mRingTail); // There are multiple consumers.
|
||||
}
|
||||
|
||||
INLINE bool IsEmpty() { return (GetHead() == GetTail()); }
|
||||
|
||||
INLINE bool IsFull()
|
||||
{
|
||||
uint32_t numEnqueued = GetHead() - GetTail();
|
||||
SWR_ASSERT(numEnqueued <= mNumEntries);
|
||||
|
||||
return (numEnqueued == mNumEntries);
|
||||
}
|
||||
|
||||
INLINE uint32_t GetTail() volatile { return mRingTail; }
|
||||
INLINE uint32_t GetHead() volatile { return mRingHead; }
|
||||
|
||||
protected:
|
||||
T* mpRingBuffer;
|
||||
uint32_t mNumEntries;
|
||||
|
||||
OSALIGNLINE(volatile uint32_t) mRingHead; // Consumer Counter
|
||||
OSALIGNLINE(volatile uint32_t) mRingTail; // Producer Counter
|
||||
};
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,67 +0,0 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* @file state_funcs.h
|
||||
*
|
||||
* @brief Definitions for API state - complex function implementation.
|
||||
*
|
||||
******************************************************************************/
|
||||
#pragma once
|
||||
|
||||
#include "core/state.h"
|
||||
#include "common/simdintrin.h"
|
||||
|
||||
template <typename MaskT>
|
||||
INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max)
|
||||
{
|
||||
__m128i vMin = _mm_set1_epi32(*min);
|
||||
__m128i vMax = _mm_set1_epi32(*max);
|
||||
return _simd_blend4_epi32<MaskT::value>(vMin, vMax);
|
||||
}
|
||||
|
||||
/// @brief Builds the per-sample SIMD copies of the sample positions.
///
/// For each of the first numSamples samples, the integer (_xi/_yi) and
/// float (_x/_y) positions are broadcast into _vXi/_vYi and _vX/_vY.
/// The tile sample offsets are then recomputed from the same samples.
/// @param numSamples - number of leading sample entries to process
INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples)
{
    for (int sample = 0; sample < numSamples; ++sample)
    {
        // integer positions
        _vXi[sample] = _mm_set1_epi32(_xi[sample]);
        _vYi[sample] = _mm_set1_epi32(_yi[sample]);
        // float positions
        _vX[sample] = _simd_set1_ps(_x[sample]);
        _vY[sample] = _simd_set1_ps(_y[sample]);
    }

    // precalculate the raster tile BB for the rasterizer.
    CalcTileSampleOffsets(numSamples);
}
|
||||
|
||||
/// @brief Computes tileSampleOffsetsX / tileSampleOffsetsY from the extreme
///        integer sample positions.
///
/// Finds the smallest and largest entry among the first numSamples entries
/// of _xi (and likewise _yi), then expands and blends each min/max pair
/// into a 4-lane vector. Which lanes receive the min and which the max is
/// fixed by the blend mask: 0xA for X, 0xC for Y.
/// @param numSamples - number of leading sample entries to consider
INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples)
{
    using xMask = std::integral_constant<int, 0xA>;
    using yMask = std::integral_constant<int, 0xC>;

    // One pass per axis. Only the pointed-to values are used, so it does not
    // matter which of several equal extremes the iterators refer to.
    auto xExtremes = std::minmax_element(std::begin(_xi), &_xi[numSamples]);
    // BR(max), BL(min), UR(max), UL(min)
    tileSampleOffsetsX = expandThenBlend4<xMask>(xExtremes.first, xExtremes.second);

    auto yExtremes = std::minmax_element(std::begin(_yi), &_yi[numSamples]);
    // NOTE(review): the original repeats the X lane comment here although the
    // mask differs (0xC) - confirm the intended lane layout for Y.
    tileSampleOffsetsY = expandThenBlend4<yMask>(yExtremes.first, yExtremes.second);
}
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue