gallium/swr: Remove driver source

The OpenSWR driver will be maintained on a classic/LTS branch.

Reviewed-by: Dylan Baker <dylan@pnwbakers.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11264>
Authored by Jan Zielinski on 2021-06-09 13:19:44 +02:00; committed by Marge Bot
parent d22d328859
commit 855793c6c6
178 changed files with 0 additions and 85594 deletions


@@ -1,64 +0,0 @@
---
Language: Cpp
AccessModifierOffset: -3
AlignAfterOpenBracket: true
AlignEscapedNewlinesLeft: false
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AlwaysBreakAfterDefinitionReturnType: true
AlwaysBreakTemplateDeclarations: false
AlwaysBreakBeforeMultilineStrings: false
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: true
BinPackParameters: false
BinPackArguments: false
ColumnLimit: 78
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 3
DerivePointerAlignment: false
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: false
IndentWrappedFunctionNames: false
IndentFunctionDeclarationAfterType: false
MaxEmptyLinesToKeep: 2
KeepEmptyLinesAtTheStartOfBlocks: true
NamespaceIndentation: Inner
ObjCBlockIndentWidth: 3
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 0
PointerAlignment: Right
SpacesBeforeTrailingComments: 1
Cpp11BracedListStyle: true
Standard: Cpp11
IndentWidth: 3
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Linux
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpaceAfterCStyleCast: false
SpacesInContainerLiterals: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 3
CommentPragmas: '^ IWYU pragma:'
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
SpaceBeforeParens: ControlStatements
DisableFormat: false
...
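For illustration only, a hypothetical snippet (not from the tree) showing roughly how clang-format lays out a function definition under the options above: the return type breaks onto its own line (AlwaysBreakAfterDefinitionReturnType), braces follow the Linux style, pointers bind right, and bodies indent by three spaces.

static uint32_t
CountSetBits(const uint32_t *pMasks, uint32_t count)
{
   uint32_t total = 0;
   for (uint32_t i = 0; i < count; ++i) {
      total += __builtin_popcount(pMasks[i]); // GCC/Clang builtin popcount
   }
   return total;
}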


@@ -1,411 +0,0 @@
# Copyright © 2017-2020 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
files_swr_common = files(
'rasterizer/common/formats.cpp',
'rasterizer/common/formats.h',
'rasterizer/common/intrin.h',
'rasterizer/common/isa.hpp',
'rasterizer/common/os.cpp',
'rasterizer/common/os.h',
'rasterizer/common/rdtsc_buckets.cpp',
'rasterizer/common/rdtsc_buckets.h',
'rasterizer/common/rdtsc_buckets_shared.h',
'rasterizer/common/simd16intrin.h',
'rasterizer/common/simdintrin.h',
'rasterizer/common/simdlib.hpp',
'rasterizer/common/simdlib_interface.hpp',
'rasterizer/common/simdlib_types.hpp',
'rasterizer/common/swr_assert.cpp',
'rasterizer/common/swr_assert.h',
)
files_swr_mesa = files(
'swr_loader.cpp',
'swr_clear.cpp',
'swr_context.cpp',
'swr_context.h',
'swr_draw.cpp',
'swr_public.h',
'swr_resource.h',
'swr_screen.cpp',
'swr_screen.h',
'swr_state.cpp',
'swr_state.h',
'swr_tex_sample.cpp',
'swr_tex_sample.h',
'swr_scratch.h',
'swr_scratch.cpp',
'swr_shader.cpp',
'swr_shader.h',
'swr_memory.h',
'swr_fence.h',
'swr_fence.cpp',
'swr_fence_work.h',
'swr_fence_work.cpp',
'swr_query.h',
'swr_query.cpp',
'rasterizer/jitter/blend_jit.cpp',
'rasterizer/jitter/blend_jit.h',
'rasterizer/jitter/builder.cpp',
'rasterizer/jitter/builder.h',
'rasterizer/jitter/builder_math.h',
'rasterizer/jitter/builder_mem.cpp',
'rasterizer/jitter/builder_mem.h',
'rasterizer/jitter/builder_gfx_mem.cpp',
'rasterizer/jitter/builder_gfx_mem.h',
'rasterizer/jitter/builder_misc.cpp',
'rasterizer/jitter/builder_misc.h',
'rasterizer/jitter/fetch_jit.cpp',
'rasterizer/jitter/fetch_jit.h',
'rasterizer/jitter/jit_api.h',
'rasterizer/jitter/JitManager.cpp',
'rasterizer/jitter/JitManager.h',
'rasterizer/jitter/streamout_jit.cpp',
'rasterizer/jitter/streamout_jit.h',
'rasterizer/jitter/shader_lib/DebugOutput.cpp',
'rasterizer/jitter/shader_lib/Scatter.cpp',
'rasterizer/jitter/functionpasses/lower_x86.cpp',
'rasterizer/memory/SurfaceState.h'
)
files_swr_arch = files(
'rasterizer/archrast/archrast.cpp',
'rasterizer/archrast/archrast.h',
'rasterizer/archrast/eventmanager.h',
'rasterizer/core/api.cpp',
'rasterizer/core/api.h',
'rasterizer/core/arena.h',
'rasterizer/core/backend.cpp',
'rasterizer/core/backend_clear.cpp',
'rasterizer/core/backend_sample.cpp',
'rasterizer/core/backend_singlesample.cpp',
'rasterizer/core/backend.h',
'rasterizer/core/backend_impl.h',
'rasterizer/core/binner.cpp',
'rasterizer/core/binner.h',
'rasterizer/core/blend.h',
'rasterizer/core/clip.cpp',
'rasterizer/core/clip.h',
'rasterizer/core/conservativeRast.h',
'rasterizer/core/context.h',
'rasterizer/core/depthstencil.h',
'rasterizer/core/fifo.hpp',
'rasterizer/core/format_conversion.h',
'rasterizer/core/format_traits.h',
'rasterizer/core/format_types.h',
'rasterizer/core/format_utils.h',
'rasterizer/core/frontend.cpp',
'rasterizer/core/frontend.h',
'rasterizer/core/knobs.h',
'rasterizer/core/knobs_init.h',
'rasterizer/core/multisample.h',
'rasterizer/core/pa_avx.cpp',
'rasterizer/core/pa.h',
'rasterizer/core/rasterizer.cpp',
'rasterizer/core/rasterizer.h',
'rasterizer/core/rasterizer_impl.h',
'rasterizer/core/rdtsc_core.cpp',
'rasterizer/core/rdtsc_core.h',
'rasterizer/core/ringbuffer.h',
'rasterizer/core/state.h',
'rasterizer/core/state_funcs.h',
'rasterizer/core/tessellator.h',
'rasterizer/core/tessellator.hpp',
'rasterizer/core/tessellator.cpp',
'rasterizer/core/threads.cpp',
'rasterizer/core/threads.h',
'rasterizer/core/tilemgr.cpp',
'rasterizer/core/tilemgr.h',
'rasterizer/core/tileset.h',
'rasterizer/core/utils.h',
'rasterizer/memory/ClearTile.cpp',
'rasterizer/memory/Convert.h',
'rasterizer/memory/LoadTile.cpp',
'rasterizer/memory/LoadTile.h',
'rasterizer/memory/LoadTile_Linear.cpp',
'rasterizer/memory/LoadTile_TileX.cpp',
'rasterizer/memory/LoadTile_TileY.cpp',
'rasterizer/memory/StoreTile.cpp',
'rasterizer/memory/StoreTile.h',
'rasterizer/memory/StoreTile_Linear2.cpp',
'rasterizer/memory/StoreTile_Linear.cpp',
'rasterizer/memory/StoreTile_TileW.cpp',
'rasterizer/memory/StoreTile_TileX2.cpp',
'rasterizer/memory/StoreTile_TileX.cpp',
'rasterizer/memory/StoreTile_TileY2.cpp',
'rasterizer/memory/StoreTile_TileY.cpp',
'rasterizer/memory/TilingFunctions.h',
'rasterizer/memory/tilingtraits.h',
'rasterizer/memory/InitMemory.h',
'rasterizer/memory/InitMemory.cpp',
'rasterizer/memory/SurfaceState.h'
)
swr_context_files = files('swr_context.h')
swr_state_files = files('rasterizer/core/state.h')
swr_surf_state_files = files('rasterizer/memory/SurfaceState.h')
swr_event_proto_files = files('rasterizer/archrast/events.proto')
swr_event_pproto_files = files('rasterizer/archrast/events_private.proto')
swr_gen_backend_files = files('rasterizer/codegen/templates/gen_backend.cpp')
swr_gen_rasterizer_files = files('rasterizer/codegen/templates/gen_rasterizer.cpp')
swr_gen_header_init_files = files('rasterizer/codegen/templates/gen_header_init.hpp')
swr_gen_llvm_ir_macros_py = files('rasterizer/codegen/gen_llvm_ir_macros.py')
swr_gen_backends_py = files('rasterizer/codegen/gen_backends.py')
swr_gen_builder_depends = files(
'rasterizer/codegen/templates/gen_builder.hpp',
'rasterizer/codegen/gen_common.py'
)
subdir('rasterizer/jitter')
subdir('rasterizer/codegen')
subdir('rasterizer/core/backends')
swr_incs = include_directories(
'rasterizer/codegen', 'rasterizer/core', 'rasterizer/jitter',
'rasterizer/archrast', 'rasterizer',
)
swr_cpp_args = []
if cpp.has_argument('-fno-strict-aliasing')
swr_cpp_args += '-fno-strict-aliasing'
endif
if cpp.has_argument('-Wno-aligned-new')
swr_cpp_args += '-Wno-aligned-new'
endif
swr_arch_libs = []
swr_defines = []
swr_avx_args = cpp.first_supported_argument(
'-target-cpu=sandybridge', '-mavx', '-march=core-avx', '-tp=sandybridge',
'/arch:AVX',
)
if swr_avx_args == []
error('Cannot find AVX support for swr. (These flags are required for SWR on all architectures.)')
endif
shared_swr = get_option('shared-swr')
if not shared_swr
if with_swr_arches.length() > 1
error('When SWR is linked statically only one architecture is allowed.')
endif
swr_defines += '-DHAVE_SWR_BUILTIN'
endif
if with_swr_arches.contains('skx')
swr_skx_args = cpp.first_supported_argument(
'-march=skylake-avx512', '-target-cpu=x86-skylake', '-xCORE-AVX512',
)
if swr_skx_args == []
error('Cannot find SKX support for swr.')
endif
swr_defines += '-DHAVE_SWR_SKX'
if shared_swr
swr_arch_libs += shared_library(
'swrSKX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrSKX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_skx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if with_swr_arches.contains('knl')
swr_knl_args = cpp.first_supported_argument(
'-march=knl', '-target-cpu=mic-knl', '-xMIC-AVX512',
)
if swr_knl_args == []
error('Cannot find KNL support for swr.')
endif
swr_defines += '-DHAVE_SWR_KNL'
if shared_swr
swr_arch_libs += shared_library(
'swrKNL',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrKNL',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_knl_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX512', '-DSIMD_ARCH_KNIGHTS',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if with_swr_arches.contains('avx2')
swr_avx2_args = cpp.first_supported_argument(
'-target-cpu=haswell', '-march=core-avx2', '-tp=haswell', '/arch:AVX2',
)
if swr_avx2_args == []
if cpp.has_argument(['-mavx2', '-mfma', '-mbmi2', '-mf16c'])
swr_avx2_args = ['-mavx2', '-mfma', '-mbmi2', '-mf16c']
else
error('Cannot find AVX2 support for swr.')
endif
endif
swr_defines += '-DHAVE_SWR_AVX2'
if shared_swr
swr_arch_libs += shared_library(
'swrAVX2',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX2',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrAVX2',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx2_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX2',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if with_swr_arches.contains('avx')
swr_defines += '-DHAVE_SWR_AVX'
if shared_swr
swr_arch_libs += shared_library(
'swrAVX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
version : '0.0.0',
soversion : host_machine.system() == 'windows' ? '' : '0',
install : true,
name_prefix : host_machine.system() == 'windows' ? '' : 'lib',
)
else
swr_arch_libs += static_library(
'swrAVX',
[files_swr_common, files_swr_arch],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
'-DKNOB_ARCH=KNOB_ARCH_AVX',
],
gnu_symbol_visibility : 'hidden',
link_args : [ld_args_gc_sections],
include_directories : [swr_incs],
dependencies : [dep_thread, dep_llvm],
)
endif
endif
if swr_arch_libs == []
error('SWR configured, but no SWR architectures configured')
endif
# The swr_avx_args are needed for intrinsic usage in the swr api headers.
libmesaswr = static_library(
'mesaswr',
[files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp,
gen_builder_hpp, gen_builder_meta_hpp, gen_builder_intrin_hpp],
cpp_args : [
cpp_msvc_compat_args, swr_cpp_args, swr_avx_args,
swr_defines,
],
gnu_symbol_visibility : 'hidden',
include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, swr_incs],
dependencies : [dep_llvm, idep_mesautil],
)
link_libs = [libmesaswr]
if not shared_swr
link_libs += swr_arch_libs
endif
driver_swr = declare_dependency(
compile_args : '-DGALLIUM_SWR',
link_with : link_libs
)


@@ -1,8 +0,0 @@
((prog-mode
(c-basic-offset . 4)
(c-file-style . "k&r")
(fill-column . 78)
(indent-tabs-mode . nil)
(show-trailing-whitespace . t)
)
)


@@ -1,114 +0,0 @@
---
Language: Cpp
# BasedOnStyle: LLVM
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: true
AfterControlStatement: true
AfterEnum: true
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: true
AfterStruct: true
AfterUnion: true
#AfterExternBlock: false
BeforeCatch: true
BeforeElse: true
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: AfterColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
#IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
Priority: 3
- Regex: '.*'
Priority: 1
IncludeIsMainRegex: '(Test)?$'
IndentCaseLabels: false
#IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
#RawStringFormats:
# - Delimiter: pb
# Language: TextProto
# BasedOnStyle: google
ReflowComments: true
SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 4
UseTab: Never
...


@@ -1,708 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file archrast.cpp
*
* @brief Implementation for archrast.
*
******************************************************************************/
#include <sys/stat.h>
#include <atomic>
#include <map>
#include "common/os.h"
#include "archrast/archrast.h"
#include "archrast/eventmanager.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandlerfile.hpp"
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// @brief struct that keeps track of depth and stencil event information
struct DepthStencilStats
{
uint32_t earlyZTestPassCount = 0;
uint32_t earlyZTestFailCount = 0;
uint32_t lateZTestPassCount = 0;
uint32_t lateZTestFailCount = 0;
uint32_t earlyStencilTestPassCount = 0;
uint32_t earlyStencilTestFailCount = 0;
uint32_t lateStencilTestPassCount = 0;
uint32_t lateStencilTestFailCount = 0;
};
struct CStats
{
uint32_t trivialRejectCount;
uint32_t trivialAcceptCount;
uint32_t mustClipCount;
};
struct TEStats
{
uint32_t inputPrims = 0;
//@todo: Change this to numPatches. Assumed: 1 patch per prim. If that holds, it's fine.
};
struct GSStateInfo
{
uint32_t inputPrimCount;
uint32_t primGeneratedCount;
uint32_t vertsInput;
};
struct RastStats
{
uint32_t rasterTiles = 0;
};
struct CullStats
{
uint32_t degeneratePrimCount = 0;
uint32_t backfacePrimCount = 0;
};
struct AlphaStats
{
uint32_t alphaTestCount = 0;
uint32_t alphaBlendCount = 0;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles API thread events. This is shared
/// between the API and its caller (e.g. the driver shim), but typically
/// there is only a single API thread per context, so state can be saved
/// in the class and reused when handling later events.
class EventHandlerApiStats : public EventHandlerFile
{
public:
EventHandlerApiStats(uint32_t id) : EventHandlerFile(id)
{
#if defined(_WIN32)
// Attempt to copy the events.proto file to the ArchRast output dir. It's common for
// tools to place the events.proto file in the DEBUG_OUTPUT_DIR when launching AR. If it
// exists, this will attempt to copy it the first time we get here to package it with
// the stats. Otherwise, the user would need to specify the events.proto location when
// parsing the stats in post.
std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename;
eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto" << std::ends;
eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1)
<< "\\events.proto" << std::ends;
// If events.proto already exists at the destination, we're done; else do the copy
struct stat buf; // Use a POSIX stat for the file-existence check
if (stat(eventsProtoDstFilename.str().c_str(), &buf) != 0)
{
// Now check to make sure the events.proto source exists
if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0)
{
std::ifstream srcFile;
srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary);
if (srcFile.is_open())
{
// Just do a binary buffer copy
std::ofstream dstFile;
dstFile.open(eventsProtoDstFilename.str().c_str(), std::ios::binary);
dstFile << srcFile.rdbuf();
dstFile.close();
}
srcFile.close();
}
}
#endif
}
virtual void Handle(const DrawInstancedEvent& event)
{
DrawInfoEvent e(event.data.drawId,
ArchRast::Instanced,
event.data.topology,
event.data.numVertices,
0,
0,
event.data.startVertex,
event.data.numInstances,
event.data.startInstance,
event.data.tsEnable,
event.data.gsEnable,
event.data.soEnable,
event.data.soTopology,
event.data.splitId);
EventHandlerFile::Handle(e);
}
virtual void Handle(const DrawIndexedInstancedEvent& event)
{
DrawInfoEvent e(event.data.drawId,
ArchRast::IndexedInstanced,
event.data.topology,
0,
event.data.numIndices,
event.data.indexOffset,
event.data.baseVertex,
event.data.numInstances,
event.data.startInstance,
event.data.tsEnable,
event.data.gsEnable,
event.data.soEnable,
event.data.soTopology,
event.data.splitId);
EventHandlerFile::Handle(e);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles worker thread events. There is one
/// event handler per thread. The python script will need to sum
/// up counters across all of the threads.
class EventHandlerWorkerStats : public EventHandlerFile
{
public:
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
{
memset(mShaderStats, 0, sizeof(mShaderStats));
}
virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
{
// earlyZ test compute
mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSingleSample.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test compute
mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSingleSample.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// earlyZ test single and multi sample
mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test single and multi sample
mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const EarlyDepthStencilInfoSampleRate& event)
{
// earlyZ test compute
mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSampleRate.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test compute
mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSampleRate.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// earlyZ test single and multi sample
mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test single and multi sample
mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const EarlyDepthStencilInfoNullPS& event)
{
// earlyZ test compute
mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSNullPS.earlyZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// earlyStencil test compute
mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSNullPS.earlyStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const LateDepthStencilInfoSingleSample& event)
{
// lateZ test compute
mDSSingleSample.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSingleSample.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test compute
mDSSingleSample.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSingleSample.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// lateZ test single and multi sample
mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test single and multi sample
mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const LateDepthStencilInfoSampleRate& event)
{
// lateZ test compute
mDSSampleRate.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSSampleRate.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test compute
mDSSampleRate.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSSampleRate.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
// lateZ test single and multi sample
mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSCombined.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test single and multi sample
mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSCombined.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const LateDepthStencilInfoNullPS& event)
{
// lateZ test compute
mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
mDSNullPS.lateZTestFailCount +=
_mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
// lateStencil test compute
mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
mDSNullPS.lateStencilTestFailCount +=
_mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
mNeedFlush = true;
}
virtual void Handle(const EarlyDepthInfoPixelRate& event)
{
// earlyZ test compute
mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount;
mDSPixelRate.earlyZTestFailCount +=
(_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
mNeedFlush = true;
}
virtual void Handle(const LateDepthInfoPixelRate& event)
{
// lateZ test compute
mDSPixelRate.lateZTestPassCount += event.data.depthPassCount;
mDSPixelRate.lateZTestFailCount +=
(_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
mNeedFlush = true;
}
virtual void Handle(const ClipInfoEvent& event)
{
mClipper.mustClipCount += _mm_popcnt_u32(event.data.clipMask);
mClipper.trivialRejectCount +=
event.data.numInvocations - _mm_popcnt_u32(event.data.validMask);
mClipper.trivialAcceptCount +=
_mm_popcnt_u32(event.data.validMask & ~event.data.clipMask);
}
void UpdateStats(SWR_SHADER_STATS* pStatTotals, const SWR_SHADER_STATS* pStatUpdate)
{
pStatTotals->numInstExecuted += pStatUpdate->numInstExecuted;
pStatTotals->numSampleExecuted += pStatUpdate->numSampleExecuted;
pStatTotals->numSampleLExecuted += pStatUpdate->numSampleLExecuted;
pStatTotals->numSampleBExecuted += pStatUpdate->numSampleBExecuted;
pStatTotals->numSampleCExecuted += pStatUpdate->numSampleCExecuted;
pStatTotals->numSampleCLZExecuted += pStatUpdate->numSampleCLZExecuted;
pStatTotals->numSampleCDExecuted += pStatUpdate->numSampleCDExecuted;
pStatTotals->numGather4Executed += pStatUpdate->numGather4Executed;
pStatTotals->numGather4CExecuted += pStatUpdate->numGather4CExecuted;
pStatTotals->numGather4CPOExecuted += pStatUpdate->numGather4CPOExecuted;
pStatTotals->numGather4CPOCExecuted += pStatUpdate->numGather4CPOCExecuted;
pStatTotals->numLodExecuted += pStatUpdate->numLodExecuted;
}
virtual void Handle(const VSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_VERTEX], pStats);
}
virtual void Handle(const GSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_GEOMETRY], pStats);
}
virtual void Handle(const DSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_DOMAIN], pStats);
}
virtual void Handle(const HSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_HULL], pStats);
}
virtual void Handle(const PSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_PIXEL], pStats);
mNeedFlush = true;
}
virtual void Handle(const CSStats& event)
{
SWR_SHADER_STATS* pStats = (SWR_SHADER_STATS*)event.data.hStats;
UpdateStats(&mShaderStats[SHADER_COMPUTE], pStats);
mNeedFlush = true;
}
// Flush cached events for this draw
virtual void FlushDraw(uint32_t drawId)
{
if (mNeedFlush == false)
return;
EventHandlerFile::Handle(PSInfo(drawId,
mShaderStats[SHADER_PIXEL].numInstExecuted,
mShaderStats[SHADER_PIXEL].numSampleExecuted,
mShaderStats[SHADER_PIXEL].numSampleLExecuted,
mShaderStats[SHADER_PIXEL].numSampleBExecuted,
mShaderStats[SHADER_PIXEL].numSampleCExecuted,
mShaderStats[SHADER_PIXEL].numSampleCLZExecuted,
mShaderStats[SHADER_PIXEL].numSampleCDExecuted,
mShaderStats[SHADER_PIXEL].numGather4Executed,
mShaderStats[SHADER_PIXEL].numGather4CExecuted,
mShaderStats[SHADER_PIXEL].numGather4CPOExecuted,
mShaderStats[SHADER_PIXEL].numGather4CPOCExecuted,
mShaderStats[SHADER_PIXEL].numLodExecuted));
EventHandlerFile::Handle(CSInfo(drawId,
mShaderStats[SHADER_COMPUTE].numInstExecuted,
mShaderStats[SHADER_COMPUTE].numSampleExecuted,
mShaderStats[SHADER_COMPUTE].numSampleLExecuted,
mShaderStats[SHADER_COMPUTE].numSampleBExecuted,
mShaderStats[SHADER_COMPUTE].numSampleCExecuted,
mShaderStats[SHADER_COMPUTE].numSampleCLZExecuted,
mShaderStats[SHADER_COMPUTE].numSampleCDExecuted,
mShaderStats[SHADER_COMPUTE].numGather4Executed,
mShaderStats[SHADER_COMPUTE].numGather4CExecuted,
mShaderStats[SHADER_COMPUTE].numGather4CPOExecuted,
mShaderStats[SHADER_COMPUTE].numGather4CPOCExecuted,
mShaderStats[SHADER_COMPUTE].numLodExecuted));
// singleSample
EventHandlerFile::Handle(EarlyZSingleSample(
drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount));
EventHandlerFile::Handle(LateZSingleSample(
drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount));
EventHandlerFile::Handle(
EarlyStencilSingleSample(drawId,
mDSSingleSample.earlyStencilTestPassCount,
mDSSingleSample.earlyStencilTestFailCount));
EventHandlerFile::Handle(
LateStencilSingleSample(drawId,
mDSSingleSample.lateStencilTestPassCount,
mDSSingleSample.lateStencilTestFailCount));
// sampleRate
EventHandlerFile::Handle(EarlyZSampleRate(
drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount));
EventHandlerFile::Handle(LateZSampleRate(
drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount));
EventHandlerFile::Handle(
EarlyStencilSampleRate(drawId,
mDSSampleRate.earlyStencilTestPassCount,
mDSSampleRate.earlyStencilTestFailCount));
EventHandlerFile::Handle(LateStencilSampleRate(drawId,
mDSSampleRate.lateStencilTestPassCount,
mDSSampleRate.lateStencilTestFailCount));
// combined
EventHandlerFile::Handle(
EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount));
EventHandlerFile::Handle(
LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount));
EventHandlerFile::Handle(EarlyStencil(drawId,
mDSCombined.earlyStencilTestPassCount,
mDSCombined.earlyStencilTestFailCount));
EventHandlerFile::Handle(LateStencil(drawId,
mDSCombined.lateStencilTestPassCount,
mDSCombined.lateStencilTestFailCount));
// pixelRate
EventHandlerFile::Handle(EarlyZPixelRate(
drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount));
EventHandlerFile::Handle(LateZPixelRate(
drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount));
// NullPS
EventHandlerFile::Handle(
EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount));
EventHandlerFile::Handle(EarlyStencilNullPS(
drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount));
// Rasterized Subspans
EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles));
// Alpha Subspans
EventHandlerFile::Handle(
AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
// Primitive Culling
EventHandlerFile::Handle(
CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
mDSSingleSample = {};
mDSSampleRate = {};
mDSCombined = {};
mDSPixelRate = {};
mDSNullPS = {};
rastStats = {};
mCullStats = {};
mAlphaStats = {};
mShaderStats[SHADER_PIXEL] = {};
mShaderStats[SHADER_COMPUTE] = {};
mNeedFlush = false;
}
virtual void Handle(const FrontendDrawEndEvent& event)
{
// Clipper
EventHandlerFile::Handle(ClipperEvent(event.data.drawId,
mClipper.trivialRejectCount,
mClipper.trivialAcceptCount,
mClipper.mustClipCount));
// Tessellator
EventHandlerFile::Handle(TessPrims(event.data.drawId, mTS.inputPrims));
// Geometry Shader
EventHandlerFile::Handle(GSInputPrims(event.data.drawId, mGS.inputPrimCount));
EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, mGS.primGeneratedCount));
EventHandlerFile::Handle(GSVertsInput(event.data.drawId, mGS.vertsInput));
EventHandlerFile::Handle(VSInfo(event.data.drawId,
mShaderStats[SHADER_VERTEX].numInstExecuted,
mShaderStats[SHADER_VERTEX].numSampleExecuted,
mShaderStats[SHADER_VERTEX].numSampleLExecuted,
mShaderStats[SHADER_VERTEX].numSampleBExecuted,
mShaderStats[SHADER_VERTEX].numSampleCExecuted,
mShaderStats[SHADER_VERTEX].numSampleCLZExecuted,
mShaderStats[SHADER_VERTEX].numSampleCDExecuted,
mShaderStats[SHADER_VERTEX].numGather4Executed,
mShaderStats[SHADER_VERTEX].numGather4CExecuted,
mShaderStats[SHADER_VERTEX].numGather4CPOExecuted,
mShaderStats[SHADER_VERTEX].numGather4CPOCExecuted,
mShaderStats[SHADER_VERTEX].numLodExecuted));
EventHandlerFile::Handle(HSInfo(event.data.drawId,
mShaderStats[SHADER_HULL].numInstExecuted,
mShaderStats[SHADER_HULL].numSampleExecuted,
mShaderStats[SHADER_HULL].numSampleLExecuted,
mShaderStats[SHADER_HULL].numSampleBExecuted,
mShaderStats[SHADER_HULL].numSampleCExecuted,
mShaderStats[SHADER_HULL].numSampleCLZExecuted,
mShaderStats[SHADER_HULL].numSampleCDExecuted,
mShaderStats[SHADER_HULL].numGather4Executed,
mShaderStats[SHADER_HULL].numGather4CExecuted,
mShaderStats[SHADER_HULL].numGather4CPOExecuted,
mShaderStats[SHADER_HULL].numGather4CPOCExecuted,
mShaderStats[SHADER_HULL].numLodExecuted));
EventHandlerFile::Handle(DSInfo(event.data.drawId,
mShaderStats[SHADER_DOMAIN].numInstExecuted,
mShaderStats[SHADER_DOMAIN].numSampleExecuted,
mShaderStats[SHADER_DOMAIN].numSampleLExecuted,
mShaderStats[SHADER_DOMAIN].numSampleBExecuted,
mShaderStats[SHADER_DOMAIN].numSampleCExecuted,
mShaderStats[SHADER_DOMAIN].numSampleCLZExecuted,
mShaderStats[SHADER_DOMAIN].numSampleCDExecuted,
mShaderStats[SHADER_DOMAIN].numGather4Executed,
mShaderStats[SHADER_DOMAIN].numGather4CExecuted,
mShaderStats[SHADER_DOMAIN].numGather4CPOExecuted,
mShaderStats[SHADER_DOMAIN].numGather4CPOCExecuted,
mShaderStats[SHADER_DOMAIN].numLodExecuted));
EventHandlerFile::Handle(GSInfo(event.data.drawId,
mShaderStats[SHADER_GEOMETRY].numInstExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleLExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleBExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleCExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleCLZExecuted,
mShaderStats[SHADER_GEOMETRY].numSampleCDExecuted,
mShaderStats[SHADER_GEOMETRY].numGather4Executed,
mShaderStats[SHADER_GEOMETRY].numGather4CExecuted,
mShaderStats[SHADER_GEOMETRY].numGather4CPOExecuted,
mShaderStats[SHADER_GEOMETRY].numGather4CPOCExecuted,
mShaderStats[SHADER_GEOMETRY].numLodExecuted));
mShaderStats[SHADER_VERTEX] = {};
mShaderStats[SHADER_HULL] = {};
mShaderStats[SHADER_DOMAIN] = {};
mShaderStats[SHADER_GEOMETRY] = {};
// Reset Internal Counters
mClipper = {};
mTS = {};
mGS = {};
}
virtual void Handle(const GSPrimInfo& event)
{
mGS.inputPrimCount += event.data.inputPrimCount;
mGS.primGeneratedCount += event.data.primGeneratedCount;
mGS.vertsInput += event.data.vertsInput;
}
virtual void Handle(const TessPrimCount& event) { mTS.inputPrims += event.data.primCount; }
virtual void Handle(const RasterTileCount& event)
{
rastStats.rasterTiles += event.data.rasterTiles;
}
virtual void Handle(const CullInfoEvent& event)
{
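// Note: for any masks v and m, v ^ (v & ~m) == v & m, so each count below
// is simply "valid primitives whose degenerate/backface bit is also set".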
mCullStats.degeneratePrimCount += _mm_popcnt_u32(
event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask));
mCullStats.backfacePrimCount += _mm_popcnt_u32(
event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask));
}
virtual void Handle(const AlphaInfoEvent& event)
{
mAlphaStats.alphaTestCount += event.data.alphaTestEnable;
mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable;
}
protected:
bool mNeedFlush;
// Per draw stats
DepthStencilStats mDSSingleSample = {};
DepthStencilStats mDSSampleRate = {};
DepthStencilStats mDSPixelRate = {};
DepthStencilStats mDSCombined = {};
DepthStencilStats mDSNullPS = {};
DepthStencilStats mDSOmZ = {};
CStats mClipper = {};
TEStats mTS = {};
GSStateInfo mGS = {};
RastStats rastStats = {};
CullStats mCullStats = {};
AlphaStats mAlphaStats = {};
SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
};
static EventManager* FromHandle(HANDLE hThreadContext)
{
return reinterpret_cast<EventManager*>(hThreadContext);
}
// Construct an event manager and associate a handler with it.
HANDLE CreateThreadContext(AR_THREAD type)
{
// Can we assume single threaded here?
static std::atomic<uint32_t> counter(0);
uint32_t id = counter.fetch_add(1);
EventManager* pManager = new EventManager();
if (pManager)
{
EventHandlerFile* pHandler = nullptr;
if (type == AR_THREAD::API)
{
pHandler = new EventHandlerApiStats(id);
pManager->Attach(pHandler);
pHandler->Handle(ThreadStartApiEvent());
}
else
{
pHandler = new EventHandlerWorkerStats(id);
pManager->Attach(pHandler);
pHandler->Handle(ThreadStartWorkerEvent());
}
pHandler->MarkHeader();
return pManager;
}
SWR_INVALID("Failed to register thread.");
return nullptr;
}
void DestroyThreadContext(HANDLE hThreadContext)
{
EventManager* pManager = FromHandle(hThreadContext);
SWR_ASSERT(pManager != nullptr);
delete pManager;
}
// Dispatch event for this thread.
void Dispatch(HANDLE hThreadContext, const Event& event)
{
if (event.IsEnabled())
{
EventManager* pManager = reinterpret_cast<EventManager*>(hThreadContext);
SWR_ASSERT(pManager != nullptr);
pManager->Dispatch(event);
}
}
// Flush for this thread.
void FlushDraw(HANDLE hThreadContext, uint32_t drawId)
{
EventManager* pManager = FromHandle(hThreadContext);
SWR_ASSERT(pManager != nullptr);
pManager->FlushDraw(drawId);
}
} // namespace ArchRast
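Taken together, the exported functions above form a small per-thread lifecycle API. The sketch below shows a hypothetical caller; the FrameEndEvent positional constructor is an assumption (following how DrawInfoEvent is constructed above), and none of this is verbatim driver code.

// Create one ArchRast context per API thread at startup.
HANDLE hCtx = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);

// Per draw/frame: dispatch generated events; disabled events are filtered
// inside Dispatch() via event.IsEnabled().
uint32_t frameId = 0, nextDrawId = 1, drawId = 0; // placeholder values
ArchRast::Dispatch(hCtx, ArchRast::FrameEndEvent(frameId, nextDrawId));

// At draw boundaries, let handlers emit their cached per-draw counters.
ArchRast::FlushDraw(hCtx, drawId);

// At shutdown, destroy the context (the EventManager deletes its handlers).
ArchRast::DestroyThreadContext(hCtx);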


@@ -1,49 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file archrast.h
*
* @brief Definitions for archrast.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "eventmanager.h"
namespace ArchRast
{
enum class AR_THREAD
{
API = 0,
WORKER = 1
};
HANDLE CreateThreadContext(AR_THREAD type);
void DestroyThreadContext(HANDLE hThreadContext);
// Dispatch event for this thread.
void Dispatch(HANDLE hThreadContext, const Event& event);
void FlushDraw(HANDLE hThreadContext, uint32_t drawId);
}; // namespace ArchRast


@@ -1,88 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file eventmanager.h
*
* @brief Definitions for the event manager.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandler.hpp"
#include <vector>
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// EventManager - interface to dispatch events to handlers.
/// Event handling occurs only on a single thread.
//////////////////////////////////////////////////////////////////////////
class EventManager
{
public:
EventManager() {}
~EventManager()
{
// Event manager owns destroying handler objects once attached.
///@note See comment for Detach.
for (auto pHandler : mHandlers)
{
delete pHandler;
}
}
void Attach(EventHandler* pHandler)
{
SWR_ASSERT(pHandler != nullptr);
mHandlers.push_back(pHandler);
}
void Dispatch(const Event& event)
{
///@todo Add event filter check here.
for (auto pHandler : mHandlers)
{
event.Accept(pHandler);
}
}
void FlushDraw(uint32_t drawId)
{
for (auto pHandler : mHandlers)
{
pHandler->FlushDraw(drawId);
}
}
private:
// Handlers stay registered for life
void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); }
std::vector<EventHandler*> mHandlers;
};
}; // namespace ArchRast
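Dispatch() above relies on double dispatch: each concrete event overrides Accept() to invoke the handler's Handle() overload for its own type. The generated headers (gen_ar_event.hpp, produced by the codegen scripts referenced in meson.build) are not part of this hunk, so the following is only a guess at the shape they take, with a hypothetical member layout.

// Hypothetical sketch of a generated event class; the real code is generated
// from events.proto by rasterizer/codegen, and the details here are assumed.
class SwrSyncEvent : public Event
{
public:
    struct EventData { uint32_t drawId; } data;
    explicit SwrSyncEvent(uint32_t drawId) { data.drawId = drawId; }
    virtual void Accept(EventHandler* pHandler) const { pHandler->Handle(*this); }
};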


@@ -1,427 +0,0 @@
# Copyright (C) 2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Provides definitions for events.
enum AR_DRAW_TYPE
{
Instanced = 0,
IndexedInstanced = 1,
InstancedSplit = 2,
IndexedInstancedSplit = 3
};
event Framework::ThreadStartApiEvent
{
};
event Framework::ThreadStartWorkerEvent
{
};
///@brief Helper event to indicate end of frame. Not guaranteed to capture the end of frame on all APIs.
event ApiSwr::FrameEndEvent
{
uint32_t frameId; // current frame id
uint32_t nextDrawId; // next draw id (always incremental - does not reset)
};
///@brief Synchronization event.
event ApiSwr::SwrSyncEvent
{
uint32_t drawId;
};
///@brief Invalidate hot tiles (i.e. tile cache)
event ApiSwr::SwrInvalidateTilesEvent
{
uint32_t drawId;
};
///@brief Invalidate and discard hot tiles within pixel region
event ApiSwr::SwrDiscardRectEvent
{
uint32_t drawId;
};
///@brief Flush tiles out to memory that is typically owned by driver (e.g. Flush RT cache)
event ApiSwr::SwrStoreTilesEvent
{
uint32_t drawId;
};
event PipelineStats::DrawInfoEvent
{
uint32_t drawId;
AR_DRAW_TYPE type; // type of draw (indexed, instanced, etc)
uint32_t topology; // topology of draw
uint32_t numVertices; // number of vertices for draw
uint32_t numIndices; // number of indices for draw
int32_t indexOffset; // offset into index buffer
int32_t baseVertex; // which vertex to start with
uint32_t numInstances; // number of instances to draw
uint32_t startInstance; // which instance to start fetching
uint32_t tsEnable; // tessellation enabled
uint32_t gsEnable; // geometry shader enabled
uint32_t soEnable; // stream-out enabled
uint32_t soTopology; // topology of stream-out
uint32_t splitId; // split draw count or id
};
event PipelineStats::DispatchEvent
{
uint32_t drawId;
uint32_t threadGroupCountX; // num thread groups in X dimension
uint32_t threadGroupCountY; // num thread groups in Y dimension
uint32_t threadGroupCountZ; // num thread groups in Z dimension
};
event PipelineStats::FrontendStatsEvent
{
uint32_t drawId;
uint64_t IaVertices;
uint64_t IaPrimitives;
uint64_t VsInvocations;
uint64_t HsInvocations;
uint64_t DsInvocations;
uint64_t GsInvocations;
uint64_t GsPrimitives;
uint64_t CInvocations;
uint64_t CPrimitives;
uint64_t SoPrimStorageNeeded0;
uint64_t SoPrimStorageNeeded1;
uint64_t SoPrimStorageNeeded2;
uint64_t SoPrimStorageNeeded3;
uint64_t SoNumPrimsWritten0;
uint64_t SoNumPrimsWritten1;
uint64_t SoNumPrimsWritten2;
uint64_t SoNumPrimsWritten3;
};
event PipelineStats::BackendStatsEvent
{
uint32_t drawId;
uint64_t DepthPassCount;
uint64_t PsInvocations;
uint64_t CsInvocations;
};
event PipelineStats::EarlyZSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateZSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyStencilSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateStencilSingleSample
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyZSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateZSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyStencilSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateStencilSampleRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total Early-Z counts, SingleSample and SampleRate
event PipelineStats::EarlyZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total LateZ counts, SingleSample and SampleRate
event PipelineStats::LateZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total EarlyStencil counts, SingleSample and SampleRate
event PipelineStats::EarlyStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
// Total LateStencil counts, SingleSample and SampleRate
event PipelineStats::LateStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyZNullPS
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyStencilNullPS
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyZPixelRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateZPixelRate
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyOmZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::EarlyOmStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateOmZ
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::LateOmStencil
{
uint32_t drawId;
uint64_t passCount;
uint64_t failCount;
};
event PipelineStats::GSInputPrims
{
uint32_t drawId;
uint64_t inputPrimCount;
};
event PipelineStats::GSPrimsGen
{
uint32_t drawId;
uint64_t primGeneratedCount;
};
event PipelineStats::GSVertsInput
{
uint32_t drawId;
uint64_t vertsInput;
};
event PipelineStats::TessPrims
{
uint32_t drawId;
uint64_t primCount;
};
event PipelineStats::RasterTiles
{
uint32_t drawId;
uint32_t rastTileCount;
};
event PipelineStats::ClipperEvent
{
uint32_t drawId;
uint32_t trivialRejectCount;
uint32_t trivialAcceptCount;
uint32_t mustClipCount;
};
event PipelineStats::CullEvent
{
uint32_t drawId;
uint64_t backfacePrimCount;
uint64_t degeneratePrimCount;
};
event PipelineStats::AlphaEvent
{
uint32_t drawId;
uint32_t alphaTestCount;
uint32_t alphaBlendCount;
};
event ShaderStats::VSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::HSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::DSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::GSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::PSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};
event ShaderStats::CSInfo
{
uint32_t drawId;
uint32_t numInstExecuted;
uint32_t numSampleExecuted;
uint32_t numSampleLExecuted;
uint32_t numSampleBExecuted;
uint32_t numSampleCExecuted;
uint32_t numSampleCLZExecuted;
uint32_t numSampleCDExecuted;
uint32_t numGather4Executed;
uint32_t numGather4CExecuted;
uint32_t numGather4CPOExecuted;
uint32_t numGather4CPOCExecuted;
uint32_t numLodExecuted;
};


@@ -1,212 +0,0 @@
# Copyright (C) 2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Provides definitions for private internal events that are used only within
# Rasty, for communicating information between Rasty and ArchRast. One goal of
# ArchRast is to avoid polluting the Rasty code with the calculations needed
# to compute per-draw statistics.
event PipelineStats::EarlyDepthStencilInfoSingleSample
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::EarlyDepthStencilInfoSampleRate
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::EarlyDepthStencilInfoNullPS
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::LateDepthStencilInfoSingleSample
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::LateDepthStencilInfoSampleRate
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::LateDepthStencilInfoNullPS
{
uint64_t depthPassMask;
uint64_t stencilPassMask;
uint64_t coverageMask;
};
event PipelineStats::EarlyDepthInfoPixelRate
{
uint64_t depthPassCount;
uint64_t activeLanes;
};
event PipelineStats::LateDepthInfoPixelRate
{
uint64_t depthPassCount;
uint64_t activeLanes;
};
event PipelineStats::BackendDrawEndEvent
{
uint32_t drawId;
};
event PipelineStats::FrontendDrawEndEvent
{
uint32_t drawId;
};
event Memory::MemoryAccessEvent
{
uint32_t drawId;
uint64_t tsc;
uint64_t ptr;
uint32_t size;
uint8_t isRead;
uint8_t client;
};
event Memory::MemoryStatsEndEvent
{
uint32_t drawId;
};
event PipelineStats::TessPrimCount
{
uint64_t primCount;
};
event PipelineStats::RasterTileCount
{
uint32_t drawId;
uint64_t rasterTiles;
};
event PipelineStats::GSPrimInfo
{
uint64_t inputPrimCount;
uint64_t primGeneratedCount;
uint64_t vertsInput;
};
// validMask marks primitives that still need clip processing; they were not
// removed by trivial reject or discarded for NaN.
// clipMask marks primitives that must actually be clipped, so a trivially
// accepted primitive has its validMask bit set but its clipMask bit clear.
// Trivial reject count is numInvocations - pop_cnt32(validMask).
// Trivial accept mask is validMask & ~clipMask.
// Must clip count is pop_cnt32(clipMask).
// A worked example follows the event definition below.
event PipelineStats::ClipInfoEvent
{
uint32_t numInvocations;
uint32_t validMask;
uint32_t clipMask;
};
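// Worked example (hypothetical values): with numInvocations = 8,
// validMask = 0b00111110 (pop_cnt32 = 5) and clipMask = 0b00001100
// (pop_cnt32 = 2):
//   trivial reject = 8 - 5 = 3
//   trivial accept mask = 0b00111110 & ~0b00001100 = 0b00110010 (3 primitives)
//   must clip = 2, so 3 + 3 + 2 accounts for all 8 invocations.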
event PipelineStats::CullInfoEvent
{
uint32_t drawId;
uint64_t degeneratePrimMask;
uint64_t backfacePrimMask;
uint32_t validMask;
};
event PipelineStats::AlphaInfoEvent
{
uint32_t drawId;
uint32_t alphaTestEnable;
uint32_t alphaBlendEnable;
};
event PipelineStats::DrawInstancedEvent
{
uint32_t drawId;
uint32_t topology;
uint32_t numVertices;
int32_t startVertex;
uint32_t numInstances;
uint32_t startInstance;
uint32_t tsEnable;
uint32_t gsEnable;
uint32_t soEnable;
uint32_t soTopology;
uint32_t splitId; // Split draw count or id.
};
event PipelineStats::DrawIndexedInstancedEvent
{
uint32_t drawId;
uint32_t topology;
uint32_t numIndices;
int32_t indexOffset;
int32_t baseVertex;
uint32_t numInstances;
uint32_t startInstance;
uint32_t tsEnable;
uint32_t gsEnable;
uint32_t soEnable;
uint32_t soTopology;
uint32_t splitId; // Split draw count or id.
};
event ShaderStats::VSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::HSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::DSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::GSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::PSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};
event ShaderStats::CSStats
{
HANDLE hStats; // SWR_SHADER_STATS
};

View file

@ -1,327 +0,0 @@
# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import os
import sys
import re
from gen_common import *
def parse_event_fields(lines, idx, event_dict):
"""
Parses lines from a proto file that contain an event definition and stores it in event_dict
"""
fields = []
end_of_event = False
# record all fields in event definition.
# note: we don't check if there's a leading brace.
while not end_of_event and idx < len(lines):
line = lines[idx].rstrip()
idx += 1
# ex 1: uint32_t numSampleCLZExecuted; // number of sample_cl_z instructions executed
# ex 2: char reason[256]; // size of reason
match = re.match(r'^(\s*)([\w\*]+)(\s+)([\w]+)(\[\d+\])*;\s*(\/\/.*)*$', line)
# group 1 -
# group 2 type
# group 3 -
# group 4 name
# group 5 [array size]
# group 6 //comment
if match:
field = {
"type": match.group(2),
"name": match.group(4),
"size": int(match.group(5)[1:-1]) if match.group(5) else 1,
"desc": match.group(6)[2:].strip() if match.group(6) else "",
}
fields.append(field)
end_of_event = re.match(r'(\s*)};', line)
event_dict['fields'] = fields
event_dict['num_fields'] = len(fields)
return idx
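# Worked example (hypothetical input): given the line
#     "uint32_t numSampleExecuted; // number of sample instructions"
# the regex above produces the field dict
#     {'type': 'uint32_t', 'name': 'numSampleExecuted', 'size': 1,
#      'desc': 'number of sample instructions'}
# and an array field such as "char reason[256];" yields size == 256.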
def parse_enums(lines, idx, event_dict):
"""
Parses lines from a proto file that contain an enum definition and stores it in event_dict
"""
enum_names = []
end_of_enum = False
# record all enum values in enumeration
# note: we don't check if there's a leading brace.
while not end_of_enum and idx < len(lines):
line = lines[idx].rstrip()
idx += 1
preprocessor = re.search(r'#if|#endif', line)
if not preprocessor:
enum = re.match(r'(\s*)(\w+)(\s*)', line)
if enum:
enum_names.append(line)
end_of_enum = re.match(r'(\s*)};', line)
event_dict['names'] = enum_names
return idx
def parse_protos(files, verbose=False):
"""
Parses a proto file and returns a dictionary of event definitions
"""
# Protos structure:
#
# {
# "events": {
# "defs": { // dict of event definitions where keys are 'group_name::event_name"
# ...,
# "ApiStat::DrawInfoEvent": {
# "id": 3,
# "group": "ApiStat",
# "name": "DrawInfoEvent", // name of event without 'group_name::' prefix
# "desc": "",
# "fields": [
# {
# "type": "uint32_t",
# "name": "drawId",
# "size": 1,
# "desc": "",
# },
# ...
# ]
# },
# ...
# },
# "groups": { // dict of groups with lists of event keys
# "ApiStat": [
# "ApiStat::DispatchEvent",
# "ApiStat::DrawInfoEvent",
# ...
# ],
# "Framework": [
# "Framework::ThreadStartApiEvent",
# "Framework::ThreadStartWorkerEvent",
# ...
# ],
# ...
# },
# "map": { // map of event ids to match archrast output to event key
# "1": "Framework::ThreadStartApiEvent",
# "2": "Framework::ThreadStartWorkerEvent",
# "3": "ApiStat::DrawInfoEvent",
# ...
# }
# },
# "enums": { ... } // enums follow similar defs, map (groups?) structure
# }
protos = {
'events': {
'defs': {}, # event dictionary containing events with their fields
'map': {}, # dictionary to map event ids to event names
'groups': {} # event keys stored by groups
},
'enums': {
'defs': {},
'map': {}
}
}
event_id = 0
enum_id = 0
if not isinstance(files, list):
files = [files]
for filename in files:
if verbose:
print("Parsing proto file: %s" % os.path.normpath(filename))
with open(filename, 'r') as f:
lines = f.readlines()
in_brief = False
brief = []
idx = 0
while idx < len(lines):
line = lines[idx].strip()
idx += 1
# If currently processing a brief, keep processing or change state
if in_brief:
match = re.match(r'^\s*\/\/\/\s*(.*)$', line) # i.e. "/// more event desc..."
if match:
brief.append(match.group(1).strip())
continue
else:
in_brief = False
# Match event/enum brief
match = re.match(r'^\s*\/\/\/\s*@(brief|breif)\s*(.*)$', line) # i.e. "///@brief My event desc..."
if match:
in_brief = True
brief.append(match.group(2).strip())
continue
# Match event definition
match = re.match(r'event(\s*)(((\w*)::){0,1}(\w+))', line) # i.e. "event SWTag::CounterEvent"
if match:
event_id += 1
# Parse event attributes
event_key = match.group(2) # i.e. SWTag::CounterEvent
event_group = match.group(4) if match.group(4) else "" # i.e. SWTag
event_name = match.group(5) # i.e. CounterEvent
# Define event attributes
event = {
'id': event_id,
'group': event_group,
'name': event_name,
'desc': ' '.join(brief)
}
# Add period at end of event desc if necessary
if event["desc"] and event["desc"][-1] != '.':
event["desc"] += '.'
# Reset brief
brief = []
# Now add event fields
idx = parse_event_fields(lines, idx, event)
# Register event and mapping
protos['events']['defs'][event_key] = event
protos['events']['map'][event_id] = event_key
continue
# Match enum definition
match = re.match(r'enum(\s*)(\w+)', line)
if match:
enum_id += 1
# Parse enum attributes
enum_name = match.group(2)
# Define enum attr
enum = {
'name': enum_name,
'desc': ' '.join(brief)
}
# Add period at end of event desc if necessary
if enum["desc"] and enum["desc"][-1] != '.':
enum["desc"] += '.'
# Reset brief
brief = []
# Now add enum fields
idx = parse_enums(lines, idx, enum)
# Register enum and mapping
protos['enums']['defs'][enum_name] = enum
protos['enums']['map'][enum_id] = enum_name
continue
# Sort and group events
event_groups = protos['events']['groups']
for key in sorted(protos['events']['defs']):
group = protos['events']['defs'][key]['group']
if group not in event_groups:
event_groups[group] = []
event_groups[group].append(key)
return protos
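# Usage sketch (hypothetical file names and event id): after
#     protos = parse_protos(['events.proto', 'events_private.proto'])
# an event id recorded in archrast output can be resolved back to its
# definition:
#     key = protos['events']['map'][3]        # e.g. 'ApiStat::DrawInfoEvent'
#     fields = protos['events']['defs'][key]['fields']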
def main():
# Parse args...
parser = ArgumentParser()
parser.add_argument("--proto", "-p", dest="protos", nargs='+', help="Path to all proto file(s) to process. Accepts one or more paths (i.e. events.proto and events_private.proto)", required=True)
parser.add_argument("--output-dir", help="Output dir (defaults to ./codegen). Will create folder if it does not exist.", required=False, default="codegen")
parser.add_argument("--verbose", "-v", help="Verbose", action="store_true")
args = parser.parse_args()
if not os.path.exists(args.output_dir):
MakeDir(args.output_dir)
for f in args.protos:
if not os.path.exists(f):
print('Error: Could not find proto file %s' % f, file=sys.stderr)
return 1
# Parse each proto file and add to protos container
protos = parse_protos(args.protos, args.verbose)
files = [
["gen_ar_event.hpp", ""],
["gen_ar_event.cpp", ""],
["gen_ar_eventhandler.hpp", "gen_ar_event.hpp"],
["gen_ar_eventhandlerfile.hpp", "gen_ar_eventhandler.hpp"]
]
rval = 0
try:
# Delete existing files
for f in files:
filename = f[0]
output_fullpath = os.path.join(args.output_dir, filename)
if os.path.exists(output_fullpath):
if args.verbose:
print("Deleting existing file: %s" % output_fullpath)
os.remove(output_fullpath)
# Generate files from templates
print("Generating c++ from proto files...")
for f in files:
filename = f[0]
event_header = f[1]
curdir = os.path.dirname(os.path.abspath(__file__))
template_file = os.path.join(curdir, 'templates', filename)
output_fullpath = os.path.join(args.output_dir, filename)
if args.verbose:
print("Generating: %s" % output_fullpath)
MakoTemplateWriter.to_file(template_file, output_fullpath,
cmdline=sys.argv,
filename=filename,
protos=protos,
event_header=event_header)
except Exception as e:
print(e)
rval = 1
return rval
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,164 +0,0 @@
# Copyright (C) 2017-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the 'Software'),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import itertools
import os
import sys
from gen_common import *
def main(args=sys.argv[1:]):
thisDir = os.path.dirname(os.path.realpath(__file__))
parser = ArgumentParser('Generate files and initialization functions for all permutations of BackendPixelRate.')
parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False)
args = parser.parse_args(args)
class backendStrs :
def __init__(self) :
self.outFileName = 'gen_BackendPixelRate%s.cpp'
self.outHeaderName = 'gen_BackendPixelRate.hpp'
self.functionTableName = 'gBackendPixelRateTable'
self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
self.template = 'gen_backend.cpp'
self.hpp_template = 'gen_header_init.hpp'
self.cmakeFileName = 'gen_backends.cmake'
self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
self.tableName = 'BackendPixelRate'
if args.rast:
self.outFileName = 'gen_rasterizer%s.cpp'
self.outHeaderName = 'gen_rasterizer.hpp'
self.functionTableName = 'gRasterizerFuncs'
self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<'
self.template = 'gen_rasterizer.cpp'
self.cmakeFileName = 'gen_rasterizer.cmake'
self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES'
self.tableName = 'RasterizerFuncs'
backend = backendStrs()
output_list = []
for x in args.dim:
output_list.append(list(range(x)))
# generate all permutations possible for template parameter inputs
output_combinations = list(itertools.product(*output_list))
output_list = []
# for each permutation
for x in range(len(output_combinations)):
# separate each template param into its own list member
new_list = [output_combinations[x][i] for i in range(len(output_combinations[x]))]
tempStr = backend.functionTableName
# append each list member as an index into the multidimensional array
for i in new_list:
tempStr += '[' + str(i) + ']'
# join the permutation values into the template instantiation string
tempStr += backend.funcInstanceHeader + ','.join(map(str, output_combinations[x])) + '>>;'
# append the line of C++ code to the list of output lines
output_list.append(tempStr)
# how many files should we split the global template initialization into?
if (args.split == 0):
numFiles = 1
else:
numFiles = (len(output_list) + args.split - 1) // args.split
if (args.numfiles != 0):
numFiles = args.numfiles
linesPerFile = (len(output_list) + numFiles - 1) // numFiles
chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
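# Worked example (hypothetical sizes): with 100 generated lines and
# --split 30, numFiles = (100 + 30 - 1) // 30 = 4 and
# linesPerFile = (100 + 4 - 1) // 4 = 25, so the initializers land in
# four files of 25 lines each.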
tmp_output_dir = MakeTmpDir('_codegen')
if not os.path.exists(args.outdir):
try:
os.makedirs(args.outdir)
except OSError as err:
if err.errno != errno.EEXIST:
print('ERROR: Could not create directory:', args.outdir, file=sys.stderr)
return 1
rval = 0
# generate .cpp files
try:
if args.cpp:
baseCppName = os.path.join(tmp_output_dir, backend.outFileName)
templateCpp = os.path.join(thisDir, 'templates', backend.template)
for fileNum in range(numFiles):
filename = baseCppName % str(fileNum)
MakoTemplateWriter.to_file(
templateCpp,
filename,
cmdline=sys.argv,
fileNum=fileNum,
funcList=chunkedList[fileNum])
if args.hpp:
baseHppName = os.path.join(tmp_output_dir, backend.outHeaderName)
templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
MakoTemplateWriter.to_file(
templateHpp,
baseHppName,
cmdline=sys.argv,
numFiles=numFiles,
filename=backend.outHeaderName,
tableName=backend.tableName)
# generate gen_backend.cmake file
if args.cmake:
templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
cmakeFile = os.path.join(tmp_output_dir, backend.cmakeFileName)
MakoTemplateWriter.to_file(
templateCmake,
cmakeFile,
cmdline=sys.argv,
srcVar=backend.cmakeSrcVar,
numFiles=numFiles,
# derive the name from backend.outFileName so --cmake works without --cpp
baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(backend.outFileName))
rval = CopyDirFilesIfDifferent(tmp_output_dir, args.outdir)
except:
rval = 1
finally:
DeleteDirTree(tmp_output_dir)
return rval
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,291 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import os
import errno
import sys
import argparse
import tempfile
import filecmp
import shutil
import shlex  # used by ArgumentParser.convert_arg_line_to_args below
from mako.template import Template
from mako.exceptions import RichTraceback
#==============================================================================
def ConcatLists(list_of_lists):
output = []
for l in list_of_lists: output += l
return output
#==============================================================================
def MakeTmpDir(suffix=''):
'''
Create temporary directory for use in codegen scripts.
'''
return tempfile.mkdtemp(suffix)
#==============================================================================
def MakeDir(dir_path):
'''
Create a directory if it doesn't exist
returns 0 on success, non-zero on failure
'''
dir_path = os.path.abspath(dir_path)
if not os.path.exists(dir_path):
try:
os.makedirs(dir_path)
except OSError as err:
if err.errno != errno.EEXIST:
return 1
else:
if not os.path.isdir(dir_path):
return 1
return 0
#==============================================================================
def DeleteDirTree(dir_path):
'''
Delete directory tree.
returns 0 on success, non-zero on failure
'''
rval = 0
try:
shutil.rmtree(dir_path, False)
except:
rval = 1
return rval
#==============================================================================
def CopyFileIfDifferent(src, dst, verbose = False):
'''
Copy <src> file to <dst> file if the <dst>
file either doesn't exist or the file
contents are different.
returns 0 on success, non-zero on failure
'''
assert os.path.isfile(src)
assert (False == os.path.exists(dst) or os.path.isfile(dst))
need_copy = not os.path.exists(dst)
if not need_copy:
need_copy = not filecmp.cmp(src, dst)
if need_copy:
try:
shutil.copy2(src, dst)
except:
print('ERROR: Could not copy %s to %s' % (src, dst), file=sys.stderr)
return 1
if verbose:
print(src, '-->', dst)
return 0
#==============================================================================
def CopyDirFilesIfDifferent(src, dst, recurse = True, verbose = False, orig_dst = None):
'''
Copy files from <src> directory to <dst> directory if the <dst>
directory either doesn't contain the file or the file
contents are different.
Optionally recurses into subdirectories.
returns 0 on success, non-zero on failure
'''
assert os.path.isdir(src)
assert os.path.isdir(dst)
src = os.path.abspath(src)
dst = os.path.abspath(dst)
if not orig_dst:
orig_dst = dst
for f in os.listdir(src):
src_path = os.path.join(src, f)
dst_path = os.path.join(dst, f)
# prevent recursion
if src_path == orig_dst:
continue
if os.path.isdir(src_path):
if recurse:
if MakeDir(dst_path):
print('ERROR: Could not create directory:', dst_path, file=sys.stderr)
return 1
if verbose:
print('mkdir', dst_path)
rval = CopyDirFilesIfDifferent(src_path, dst_path, recurse, verbose, orig_dst)
else:
rval = CopyFileIfDifferent(src_path, dst_path, verbose)
if rval:
return rval
return 0
#==============================================================================
class MakoTemplateWriter:
'''
MakoTemplateWriter - Class (namespace) for functions to generate strings
or files using the Mako template module.
See http://docs.makotemplates.org/en/latest/ for
mako documentation.
'''
@staticmethod
def to_string(template_filename, **kwargs):
'''
Write template data to a string object and return the string
'''
try:
template = Template(filename=template_filename)
# Split + Join fixes line-endings for whatever platform you are using
return '\n'.join(template.render(**kwargs).splitlines())
except:
traceback = RichTraceback()
for (filename, lineno, function, line) in traceback.traceback:
print('File %s, line %s, in %s' % (filename, lineno, function))
print(line, '\n')
print('%s: %s' % (str(traceback.error.__class__.__name__), traceback.error))
raise
@staticmethod
def to_file(template_filename, output_filename, **kwargs):
'''
Write template data to a file
'''
if MakeDir(os.path.dirname(output_filename)):
return 1
with open(output_filename, 'w') as outfile:
print(MakoTemplateWriter.to_string(template_filename, **kwargs), file=outfile)
return 0
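# Usage sketch (hypothetical template and output names, assuming a Mako
# file 'hello.template' exists next to the script):
#     MakoTemplateWriter.to_file('hello.template', 'out/hello.hpp',
#                                cmdline=sys.argv, filename='hello.hpp')
# renders the template with the given keyword arguments and writes the
# result to out/hello.hpp, creating the output directory if needed.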
#==============================================================================
class ArgumentParser(argparse.ArgumentParser):
'''
Subclass of argparse.ArgumentParser
Allows parsing from command files that start with @
Example:
>bt run @myargs.txt
Contents of myargs.txt:
-m <machine>
--target cdv_win7
convert_arg_line_to_args() below allows multiple args to be placed on the
same text-file line (the default is one token per line, which is a little
cumbersome) and ignores all characters after a '#' character.
'''
#==============================================================================
class _HelpFormatter(argparse.RawTextHelpFormatter):
''' Better help formatter for argument parser '''
def _split_lines(self, text, width):
''' optimized split lines algorithm, indents split lines '''
lines = text.splitlines()
out_lines = []
if len(lines):
out_lines.append(lines[0])
for line in lines[1:]:
out_lines.append(' ' + line)
return out_lines
#==============================================================================
def __init__(self, *args, **kwargs):
''' Constructor. Compatible with argparse.ArgumentParser(),
but with some modifications for better usage and help display.
'''
super(ArgumentParser, self).__init__(
*args,
fromfile_prefix_chars='@',
formatter_class=ArgumentParser._HelpFormatter,
**kwargs)
#==========================================================================
def convert_arg_line_to_args(self, arg_line):
''' convert one line of parsed file to arguments '''
arg_line = arg_line.split('#', 1)[0]
if sys.platform == 'win32':
arg_line = arg_line.replace('\\', '\\\\')
for arg in shlex.split(arg_line):
if not arg.strip():
continue
yield arg
#==========================================================================
def _read_args_from_files(self, arg_strings):
''' read arguments from files '''
# expand arguments referencing files
new_arg_strings = []
for arg_string in arg_strings:
# for regular arguments, just add them back into the list
if arg_string[0] not in self.fromfile_prefix_chars:
new_arg_strings.append(arg_string)
# replace arguments referencing files with the file content
else:
filename = arg_string[1:]
# Search in sys.path
if not os.path.exists(filename):
for path in sys.path:
filename = os.path.join(path, arg_string[1:])
if os.path.exists(filename):
break
try:
args_file = open(filename)
try:
arg_strings = []
for arg_line in args_file.read().splitlines():
for arg in self.convert_arg_line_to_args(arg_line):
arg_strings.append(arg)
arg_strings = self._read_args_from_files(arg_strings)
new_arg_strings.extend(arg_strings)
finally:
args_file.close()
except IOError:
err = sys.exc_info()[1]
self.error(str(err))
# return the modified argument list
return new_arg_strings

View file

@ -1,80 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Python source
import os
import sys
import knob_defs
from gen_common import *
def main(args=sys.argv[1:]):
# parse args
parser = ArgumentParser()
parser.add_argument("--output", "-o", help="Path to output file", required=True)
parser.add_argument("--gen_h", "-gen_h", help="Generate gen_knobs.h", action="store_true", default=False)
parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate gen_knobs.cpp", action="store_true", required=False)
args = parser.parse_args(args)
cur_dir = os.path.dirname(os.path.abspath(__file__))
template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h')
output_filename = os.path.basename(args.output)
output_dir = MakeTmpDir('_codegen')
output_file = os.path.join(output_dir, output_filename)
rval = 0
try:
if args.gen_h:
MakoTemplateWriter.to_file(
template_h,
output_file,
cmdline=sys.argv,
filename='gen_knobs',
knobs=knob_defs.KNOBS)
if args.gen_cpp:
MakoTemplateWriter.to_file(
template_cpp,
output_file,
cmdline=sys.argv,
filename='gen_knobs',
knobs=knob_defs.KNOBS,
includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'])
rval = CopyFileIfDifferent(output_file, args.output)
except:
rval = 1
finally:
# ignore errors from delete of tmp directory
DeleteDirTree(output_dir)
return rval
if __name__ == '__main__':
sys.exit(main())

View file

@ -1,362 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
import os, sys, re
from gen_common import *
from argparse import FileType
inst_aliases = {
'SHUFFLE_VECTOR': 'VSHUFFLE',
'INSERT_ELEMENT': 'VINSERT',
'EXTRACT_ELEMENT': 'VEXTRACT',
'MEM_SET': 'MEMSET',
'MEM_CPY': 'MEMCOPY',
'MEM_MOVE': 'MEMMOVE',
'L_SHR': 'LSHR',
'A_SHR': 'ASHR',
'BIT_CAST': 'BITCAST',
'U_DIV': 'UDIV',
'S_DIV': 'SDIV',
'U_REM': 'UREM',
'S_REM': 'SREM',
'BIN_OP': 'BINOP',
}
intrinsics = [
['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
['VSCATTERPS', ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'],
['VRCPPS', ['a'], 'a'],
['VROUND', ['a', 'rounding'], 'a'],
['BEXTR_32', ['src', 'control'], 'src'],
['VPSHUFB', ['a', 'b'], 'a'],
['VPERMD', ['a', 'idx'], 'a'],
['VPERMPS', ['idx', 'a'], 'a'],
['VCVTPD2PS', ['a'], 'getVectorType(mFP32Ty, VEC_GET_NUM_ELEMS)'],
['VCVTPS2PH', ['a', 'round'], 'mSimdInt16Ty'],
['VHSUBPS', ['a', 'b'], 'a'],
['VPTESTC', ['a', 'b'], 'mInt32Ty'],
['VPTESTZ', ['a', 'b'], 'mInt32Ty'],
['VPHADDD', ['a', 'b'], 'a'],
['PDEP32', ['a', 'b'], 'a'],
['RDTSC', [], 'mInt64Ty'],
]
llvm_intrinsics = [
['CTTZ', 'cttz', ['a', 'flag'], ['a']],
['CTLZ', 'ctlz', ['a', 'flag'], ['a']],
['VSQRTPS', 'sqrt', ['a'], ['a']],
['STACKSAVE', 'stacksave', [], []],
['STACKRESTORE', 'stackrestore', ['a'], []],
['VMINPS', 'minnum', ['a', 'b'], ['a']],
['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
['VFMADDPS', 'fmuladd', ['a', 'b', 'c'], ['a']],
['DEBUGTRAP', 'debugtrap', [], []],
['POPCNT', 'ctpop', ['a'], ['a']],
['LOG2', 'log2', ['a'], ['a']],
['FABS', 'fabs', ['a'], ['a']],
['EXP2', 'exp2', ['a'], ['a']],
['COS', 'cos', ['a'], ['a']],
['SIN', 'sin', ['a'], ['a']],
['FLOOR', 'floor', ['a'], ['a']],
['POW', 'pow', ['a', 'b'], ['a']]
]
this_dir = os.path.dirname(os.path.abspath(__file__))
template = os.path.join(this_dir, 'templates', 'gen_builder.hpp')
def convert_uppercamel(name):
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper()
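# Worked examples: convert_uppercamel('AtomicCmpXchg') == 'ATOMIC_CMP_XCHG'
# and convert_uppercamel('FCmpHelper') == 'F_CMP_HELPER'; parse_ir_builder
# below later collapses a leading 'F_' or 'I_' to 'F' or 'I'.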
'''
Given an input file (e.g. IRBuilder.h), returns a list of descriptions of
the Create* functions to wrap.
'''
def parse_ir_builder(input_file):
functions = []
lines = input_file.readlines()
deprecated = None
idx = 0
while idx < len(lines) - 1:
line = lines[idx].rstrip()
idx += 1
if deprecated is None:
deprecated = re.search(r'LLVM_ATTRIBUTE_DEPRECATED', line)
#match = re.search(r'\*Create', line)
match = re.search(r'[\*\s]Create(\w*)\(', line)
if match is not None:
#print('Line: %s' % match.group(1))
# Skip function if LLVM_ATTRIBUTE_DEPRECATED found before
if deprecated is not None:
deprecated = None
continue
if re.search(r'^\s*Create', line) is not None:
func_sig = lines[idx-2].rstrip() + line
else:
func_sig = line
end_of_args = False
while not end_of_args:
end_paren = re.search(r'\)', line)
if end_paren is not None:
end_of_args = True
else:
line = lines[idx].rstrip()
func_sig += line
idx += 1
delfunc = re.search(r'LLVM_DELETED_FUNCTION|= delete;', func_sig)
if not delfunc:
func = re.search(r'(.*?)\*[\n\s]*(Create\w*)\((.*?)\)', func_sig)
if func is not None:
return_type = func.group(1).strip() + '*'
func_name = func.group(2)
arguments = func.group(3)
func_args = []
arg_names = []
args = arguments.split(',')
for arg in args:
arg = arg.strip()
if arg:
func_args.append(arg)
split_args = arg.split('=')
arg_name = split_args[0].rsplit(None, 1)[-1]
reg_arg = re.search(r'[\&\*]*(\w*)', arg_name)
if reg_arg:
arg_names += [reg_arg.group(1)]
ignore = False
# The following functions need to be ignored in openswr.
# API change in llvm-5.0 breaks baked autogen files
if func_name in ('CreateFence',
'CreateAtomicCmpXchg',
'CreateAtomicRMW'):
ignore = True
# The following functions need to be ignored.
if func_name in ('CreateInsertNUWNSWBinOp',
'CreateMaskedIntrinsic',
'CreateAlignmentAssumptionHelper',
'CreateGEP',
'CreateLoad',
'CreateMaskedLoad',
'CreateStore',
'CreateMaskedStore',
'CreateFCmpHelper',
'CreateElementUnorderedAtomicMemCpy'):
ignore = True
# Convert CamelCase to CAMEL_CASE
func_mod = re.search(r'Create(\w*)', func_name)
if func_mod:
func_mod = func_mod.group(1)
func_mod = convert_uppercamel(func_mod)
if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_':
func_mod = func_mod[0] + func_mod[2:]
# Substitute alias based on CAMEL_CASE name.
func_alias = inst_aliases.get(func_mod)
if not func_alias:
func_alias = func_mod
if func_name == 'CreateCall' or func_name == 'CreateGEP':
arglist = re.search(r'ArrayRef', ', '.join(func_args))
if arglist:
func_alias = func_alias + 'A'
if not ignore:
functions.append({
'name' : func_name,
'alias' : func_alias,
'return' : return_type,
'args' : ', '.join(func_args),
'arg_names' : arg_names,
})
return functions
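# Worked example (hypothetical declaration): a line such as
#     Value *CreateFAdd(Value *L, Value *R, const Twine &Name = "");
# is recorded as
#     {'name': 'CreateFAdd', 'alias': 'FADD', 'return': 'Value*',
#      'args': 'Value *L, Value *R, const Twine &Name = ""',
#      'arg_names': ['L', 'R', 'Name']}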
'''
Auto-generates macros for LLVM IR
'''
def generate_gen_h(functions, output_dir):
filename = 'gen_builder.hpp'
output_filename = os.path.join(output_dir, filename)
templfuncs = []
for func in functions:
decl = '%s %s(%s)' % (func['return'], func['alias'], func['args'])
templfuncs.append({
'decl' : decl,
'intrin' : func['name'],
'args' : func['arg_names'],
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
comment='Builder IR Wrappers',
filename=filename,
functions=templfuncs,
isX86=False, isIntrin=False)
'''
Auto-generates builder wrappers for the meta intrinsics table above
'''
def generate_meta_h(output_dir):
filename = 'gen_builder_meta.hpp'
output_filename = os.path.join(output_dir, filename)
functions = []
for inst in intrinsics:
name = inst[0]
args = inst[1]
ret = inst[2]
#print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
if len(args) != 0:
declargs = 'Value* ' + ', Value* '.join(args)
decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, declargs)
else:
decl = 'Value* %s(const llvm::Twine& name = "")' % (name)
# determine the return type of the intrinsic. It can either be:
# - type of one of the input arguments
# - snippet of code to set the return type
if ret in args:
returnTy = ret + '->getType()'
else:
returnTy = ret
functions.append({
'decl' : decl,
'name' : name,
'args' : args,
'returnType': returnTy
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
comment='meta intrinsics',
filename=filename,
functions=functions,
isX86=True, isIntrin=False)
def generate_intrin_h(output_dir):
filename = 'gen_builder_intrin.hpp'
output_filename = os.path.join(output_dir, filename)
functions = []
for inst in llvm_intrinsics:
#print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2])))
if len(inst[2]) != 0:
declargs = 'Value* ' + ', Value* '.join(inst[2])
decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs)
else:
decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0])
functions.append({
'decl' : decl,
'intrin' : inst[1],
'args' : inst[2],
'types' : inst[3],
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
comment='llvm intrinsics',
filename=filename,
functions=functions,
isX86=False, isIntrin=True)
'''
Command-line entry point. Parses the arguments that tell this script how
to behave and runs the requested generators.
'''
def main():
# Parse args...
parser = ArgumentParser()
parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
args = parser.parse_args()
if not os.path.exists(args.output):
os.makedirs(args.output)
final_output_dir = args.output
args.output = MakeTmpDir('_codegen')
rval = 0
try:
if args.input:
functions = parse_ir_builder(args.input)
if args.gen_h:
generate_gen_h(functions, args.output)
elif args.gen_h:
print('Need to specify --input for --gen_h!')
if args.gen_meta_h:
generate_meta_h(args.output)
if args.gen_intrin_h:
generate_intrin_h(args.output)
rval = CopyDirFilesIfDifferent(args.output, final_output_dir)
except:
print('ERROR: Could not generate llvm_ir_macros', file=sys.stderr)
rval = 1
finally:
DeleteDirTree(args.output)
return rval
if __name__ == '__main__':
sys.exit(main())
# END OF FILE

View file

@ -1,360 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
import os, sys, re
from gen_common import *
from argparse import FileType
'''
Maps a single struct member to the C++ expression that constructs its
LLVM type, returning a dict with the member name, line number, and type.
'''
def gen_llvm_type(type, name, idx, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file):
llvm_type = ''
if is_llvm_struct:
if is_pointer or is_pointer_pointer:
llvm_type = 'Type::getInt32Ty(ctx)'
else:
llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type
elif is_llvm_enum:
llvm_type = 'Type::getInt32Ty(ctx)'
elif is_llvm_pfn:
llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)'
else:
if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 'int8_t' or type == 'bool':
llvm_type = 'Type::getInt8Ty(ctx)'
elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t' or type == 'gfxptr_t':
llvm_type = 'Type::getInt64Ty(ctx)'
elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
llvm_type = 'Type::getInt16Ty(ctx)'
elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t':
llvm_type = 'Type::getInt32Ty(ctx)'
elif type == 'float' or type == 'FLOAT':
llvm_type = 'Type::getFloatTy(ctx)'
elif type == 'double' or type == 'DOUBLE':
llvm_type = 'Type::getDoubleTy(ctx)'
elif type == 'void' or type == 'VOID':
llvm_type = 'Type::getInt32Ty(ctx)'
elif type == 'HANDLE':
llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)'
elif type == 'simdscalar':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), pJitMgr->mVWidth)'
elif type == 'simdscalari':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), pJitMgr->mVWidth)'
elif type == 'simd16scalar':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)'
elif type == 'simd16scalari':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)'
elif type == '__m128i':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 4)'
elif type == 'SIMD256::Float':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), 8)'
elif type == 'SIMD256::Integer':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 8)'
elif type == 'SIMD512::Float':
llvm_type = 'getVectorType(Type::getFloatTy(ctx), 16)'
elif type == 'SIMD512::Integer':
llvm_type = 'getVectorType(Type::getInt32Ty(ctx), 16)'
elif type == 'simdvector':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)'
elif type == 'simd16vector':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)'
elif type == 'SIMD256::Vec4':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 8), 4)'
elif type == 'SIMD512::Vec4':
llvm_type = 'ArrayType::get(getVectorType(Type::getFloatTy(ctx), 16), 4)'
else:
llvm_type = 'Gen_%s(pJitMgr)' % type
if is_pointer:
llvm_type = 'PointerType::get(%s, 0)' % llvm_type
if is_pointer_pointer:
llvm_type = 'PointerType::get(%s, 0)' % llvm_type
if is_array_array:
llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count)
elif is_array:
llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count)
return {
'name' : name,
'lineNum' : idx,
'type' : llvm_type,
}
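# Worked example (hypothetical fields): a member declared as
#     uint32_t pad[4];
# maps to the type string 'ArrayType::get(Type::getInt32Ty(ctx), 4)',
# and a 'HANDLE context' member maps to
#     'PointerType::get(Type::getInt32Ty(ctx), 0)'.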
'''
Parses struct definitions from input_file and writes LLVM type
descriptions for them to output_file via the gen_llvm.hpp template.
'''
def gen_llvm_types(input_file, output_file):
lines = input_file.readlines()
types = []
for idx in range(len(lines)):
line = lines[idx].rstrip()
if 'gen_llvm_types FINI' in line:
break
match = re.match(r'(\s*)struct(\s*)(\w+)', line)
if match:
llvm_args = []
# Detect start of structure
is_fwd_decl = re.search(r';', line)
if not is_fwd_decl:
# Extract the command name
struct_name = match.group(3).strip()
type_entry = {
'name' : struct_name,
'lineNum' : idx+1,
'members' : [],
}
end_of_struct = False
while not end_of_struct and idx < len(lines)-1:
idx += 1
line = lines[idx].rstrip()
is_llvm_typedef = re.search(r'@llvm_typedef', line)
if is_llvm_typedef is not None:
is_llvm_typedef = True
continue
else:
is_llvm_typedef = False
###########################################
# Is field a llvm struct? Tells script to treat type as an array of bytes the size of the structure.
is_llvm_struct = re.search(r'@llvm_struct', line)
if is_llvm_struct is not None:
is_llvm_struct = True
else:
is_llvm_struct = False
###########################################
# Is field the start of a function? Tells script to ignore it
is_llvm_func_start = re.search(r'@llvm_func_start', line)
if is_llvm_func_start is not None:
while not end_of_struct and idx < len(lines)-1:
idx += 1
line = lines[idx].rstrip()
is_llvm_func_end = re.search(r'@llvm_func_end', line)
if is_llvm_func_end is not None:
break
continue
###########################################
# Is field a function? Tells script to ignore it
is_llvm_func = re.search(r'@llvm_func', line)
if is_llvm_func is not None:
continue
###########################################
# Is field a llvm enum? Tells script to treat type as an enum and replace it with a uint32 type.
is_llvm_enum = re.search(r'@llvm_enum', line)
if is_llvm_enum is not None:
is_llvm_enum = True
else:
is_llvm_enum = False
###########################################
# Is field a llvm function pointer? Tells script to replace the type with a generic byte pointer.
is_llvm_pfn = re.search(r'@llvm_pfn', line)
if is_llvm_pfn is not None:
is_llvm_pfn = True
else:
is_llvm_pfn = False
###########################################
# Is field const?
is_const = re.search(r'\s+const\s+', line)
if is_const is not None:
is_const = True
else:
is_const = False
###########################################
# Is field a pointer to a pointer?
is_pointer_pointer = re.search(r'\*\*', line)
if is_pointer_pointer is not None:
is_pointer_pointer = True
else:
is_pointer_pointer = False
###########################################
# Is field a pointer?
is_pointer = re.search(r'\*', line)
if is_pointer is not None:
is_pointer = True
else:
is_pointer = False
###########################################
# Is field an array of arrays?
# TODO: Can add this to a list.
is_array_array = re.search(r'\[(\w*)\]\[(\w*)\]', line)
array_count = '0'
array_count1 = '0'
if is_array_array is not None:
array_count = is_array_array.group(1)
array_count1 = is_array_array.group(2)
is_array_array = True
else:
is_array_array = False
###########################################
# Is field an array?
is_array = re.search(r'\[(\w*)\]', line)
if is_array is not None:
array_count = is_array.group(1)
is_array = True
else:
is_array = False
is_scoped = re.search('::', line)
if is_scoped is not None:
is_scoped = True
else:
is_scoped = False
type = None
name = None
if is_const and is_pointer:
if is_scoped:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)', line)
if field_match:
type = '%s%s' % (field_match.group(4), field_match.group(5))
name = field_match.group(7)
else:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)', line)
if field_match:
type = field_match.group(4)
name = field_match.group(6)
elif is_pointer:
field_match = re.match(r'(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)', line)
if field_match:
type = field_match.group(3)
name = field_match.group(5)
elif is_const:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)', line)
if field_match:
type = field_match.group(4)
name = field_match.group(6)
else:
if is_scoped:
field_match = re.match(r'\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)', line)
if field_match:
type = field_match.group(1) + '::' + field_match.group(2)
name = field_match.group(3)
else:
field_match = re.match(r'(\s*)(\w+\<*\w*\>*)(\s+)(\w+)', line)
if field_match:
type = field_match.group(2)
name = field_match.group(4)
if is_llvm_typedef is False:
if type is not None:
type_entry['members'].append(
gen_llvm_type(
type, name, idx+1, is_pointer, is_pointer_pointer, is_array, is_array_array,
array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file))
# Detect end of structure
end_of_struct = re.match(r'(\s*)};', line)
if end_of_struct:
types.append(type_entry)
cur_dir = os.path.dirname(os.path.abspath(__file__))
template = os.path.join(cur_dir, 'templates', 'gen_llvm.hpp')
MakoTemplateWriter.to_file(
template,
output_file,
cmdline=sys.argv,
filename=os.path.basename(output_file),
types=types,
input_dir=os.path.dirname(input_file.name),
input_file=os.path.basename(input_file.name))
'''
Command-line entry point. Parses the arguments that tell this script how
to behave and runs the type generation.
'''
def main():
# Parse args...
parser = ArgumentParser()
parser.add_argument('--input', '-i', type=FileType('r'),
help='Path to input file containing structs', required=True)
parser.add_argument('--output', '-o', action='store',
help='Path to output file', required=True)
args = parser.parse_args()
final_output_dir = os.path.dirname(args.output)
if MakeDir(final_output_dir):
return 1
final_output_file = args.output
tmp_dir = MakeTmpDir('_codegen')
args.output = os.path.join(tmp_dir, os.path.basename(args.output))
rval = 0
try:
gen_llvm_types(args.input, args.output)
rval = CopyFileIfDifferent(args.output, final_output_file)
except:
print('ERROR: Could not generate llvm types', file=sys.stderr)
rval = 1
finally:
DeleteDirTree(tmp_dir)
return rval
if __name__ == '__main__':
sys.exit(main())
# END OF FILE

View file

@ -1,383 +0,0 @@
# Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
import sys
# Python source
KNOBS = [
['ENABLE_ASSERT_DIALOGS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Use dialogs when asserts fire.',
'Asserts are only enabled in debug builds'],
'category' : 'debug',
}],
['SINGLE_THREADED', {
'type' : 'bool',
'default' : 'false',
'desc' : ['If enabled will perform all rendering on the API thread.',
'This is useful mainly for debugging purposes.'],
'category' : 'debug',
}],
['DUMP_SHADER_IR', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
'category' : 'debug',
}],
['USE_GENERIC_STORETILE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Always use generic function for performing StoreTile.',
'Will be slightly slower than using optimized (jitted) path'],
'category' : 'debug_adv',
}],
['FAST_CLEAR', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
'defer clear execution to first backend op on hottile, or hottile store'],
'category' : 'perf_adv',
}],
['MAX_NUMA_NODES', {
'type' : 'uint32_t',
'default' : '1' if sys.platform == 'win32' else '0',
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
'category' : 'perf',
}],
['MAX_CORES_PER_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
' 0 == ALL non-API thread cores per NUMA-node',
' N == Use at most N cores per NUMA-node'],
'category' : 'perf',
}],
['MAX_THREADS_PER_CORE', {
'type' : 'uint32_t',
'default' : '1',
'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
' 0 == ALL hyper-threads per core',
' N == Use at most N hyper-threads per physical core'],
'category' : 'perf',
}],
['MAX_WORKER_THREADS', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Maximum worker threads to spawn.',
'',
'IMPORTANT: If this is non-zero, no worker threads will be bound to',
'specific HW threads. They will all be "floating" SW threads.',
'In this case, the above 3 KNOBS will be ignored.'],
'category' : 'perf',
}],
['BASE_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting NUMA node index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
'category' : 'perf',
}],
['BASE_CORE', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting core index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of cores used.'],
'category' : 'perf',
}],
['BASE_THREAD', {
'type' : 'uint32_t',
'default' : '0',
'desc' : ['Starting thread index to use when allocating compute resources.',
'Setting this to a non-zero value will reduce the maximum # of threads used.'],
'category' : 'perf',
}],
['BUCKETS_START_FRAME', {
'type' : 'uint32_t',
'default' : '1200',
'desc' : ['Frame from when to start saving buckets data.',
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
'category' : 'perf_adv',
}],
['BUCKETS_END_FRAME', {
'type' : 'uint32_t',
'default' : '1400',
'desc' : ['Frame at which to stop saving buckets data.',
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
'category' : 'perf_adv',
}],
['WORKER_SPIN_LOOP_COUNT', {
'type' : 'uint32_t',
'default' : '5000',
'desc' : ['Number of spin-loop iterations worker threads will perform',
'before going to sleep when waiting for work'],
'category' : 'perf_adv',
}],
['MAX_DRAWS_IN_FLIGHT', {
'type' : 'uint32_t',
'default' : '256',
'desc' : ['Maximum number of draws outstanding before API thread blocks.',
'This value MUST be evenly divisible into 2^32'],
'category' : 'perf_adv',
}],
['MAX_PRIMS_PER_DRAW', {
'type' : 'uint32_t',
'default' : '49152',
'desc' : ['Maximum primitives in a single Draw().',
'Larger draws are split into smaller Draw calls.',
'Should be a multiple of (3 * vectorWidth).'],
'category' : 'perf_adv',
}],
['MAX_TESS_PRIMS_PER_DRAW', {
'type' : 'uint32_t',
'default' : '16',
'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
'Larger draws are split into smaller Draw calls.',
'Should be a multiple of (vectorWidth).'],
'category' : 'perf_adv',
}],
['DEBUG_OUTPUT_DIR', {
'type' : 'std::string',
'default' : r'%TEMP%\Rast\DebugOutput' if sys.platform == 'win32' else '/tmp/Rast/DebugOutput',
'desc' : ['Output directory for debug data.'],
'category' : 'debug',
}],
['JIT_ENABLE_CACHE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enables caching of compiled shaders'],
'category' : 'debug_adv',
}],
['JIT_OPTIMIZATION_LEVEL', {
'type' : 'int',
'default' : '-1',
'desc' : ['JIT compile optimization level:',],
'category' : 'debug',
'control' : 'dropdown',
'choices' : [
{
'name' : 'Automatic',
'desc' : 'Automatic based on other KNOB and build settings',
'value' : -1,
},
{
'name' : 'Debug',
'desc' : 'No optimization: -O0',
'value' : 0,
},
{
'name' : 'Less',
'desc' : 'Some optimization: -O1',
'value' : 1,
},
{
'name' : 'Optimize',
'desc' : 'Default Clang / LLVM optimizations: -O2',
'value' : 2,
},
{
'name' : 'Aggressive',
'desc' : 'Maximum optimization: -O3',
'value' : 3,
},
],
}],
['JIT_CACHE_DIR', {
'type' : 'std::string',
'default' : r'%TEMP%\SWR\JitCache' if sys.platform == 'win32' else '${HOME}/.swr/jitcache',
'desc' : ['Cache directory for compiled shaders.'],
'category' : 'debug',
}],
['TOSS_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Disable per-draw/dispatch execution'],
'category' : 'perf',
}],
['TOSS_QUEUE_FE', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at worker FE',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_FETCH', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at vertex fetch',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_IA', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at input assembler',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_VS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at vertex shader',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_SETUP_TRIS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at primitive setup',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_BIN_TRIS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at primitive binning',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['TOSS_RS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Stop per-draw execution at rasterizer',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf_adv',
}],
['DISABLE_SPLIT_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Don\'t split large draws into smaller draws.',
'MAX_PRIMS_PER_DRAW and MAX_TESS_PRIMS_PER_DRAW can be used to control split size.',
'',
'Useful to disable split draws for gathering archrast stats.'],
'category' : 'perf_adv',
}],
['AR_ENABLE_PIPELINE_STATS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable pipeline stats when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SHADER_STATS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable shader stats when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SWTAG_DATA', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable SWTag data when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SWR_EVENTS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable internal SWR events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_PIPELINE_EVENTS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable pipeline events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SHADER_EVENTS', {
'type' : 'bool',
'default' : 'true',
'desc' : ['Enable shader events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_SWTAG_EVENTS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable SWTag events when using Archrast'],
'category' : 'archrast',
}],
['AR_ENABLE_MEMORY_EVENTS', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable memory events when using Archrast'],
'category' : 'archrast',
}],
['AR_MEM_SET_BYTE_GRANULARITY', {
'type' : 'uint32_t',
'default' : '64',
'desc' : ['Granularity and alignment used when tracking memory accesses',
'ONLY ACTIVE UNDER ArchRast.'],
'category' : 'archrast',
}],
]
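Each definition in this list is consumed by the gen_knobs templates further down in this commit. As a rough illustration (reconstructed from those templates, not quoted from actual generated output), the TOSS_DRAW entry above would yield the following pieces of generated code:
// In gen_knobs.h, inside struct GlobalKnobs:
//-----------------------------------------------------------
// KNOB_TOSS_DRAW
//
// Disable per-draw/dispatch execution
//
DEFINE_KNOB(TOSS_DRAW, bool);
// After the struct, a convenience accessor macro:
#define KNOB_TOSS_DRAW GET_KNOB(TOSS_DRAW)
// And in gen_knobs.cpp, the static default:
bool GlobalKnobs::Knob_TOSS_DRAW::m_default = false;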

View file

@ -1,77 +0,0 @@
# Copyright © 2017-2018 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
gen_knobs_cpp = custom_target(
'gen_knobs.cpp',
input : ['gen_knobs.py'],
output : 'gen_knobs.cpp',
command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_cpp'],
depend_files : files(
'knob_defs.py', 'gen_common.py',
'templates/gen_knobs.cpp',
),
)
gen_knobs_h = custom_target(
'gen_knobs.h',
input : ['gen_knobs.py'],
output : 'gen_knobs.h',
command : [prog_python, '@INPUT0@', '--output', '@OUTPUT@', '--gen_h'],
depend_files : files(
'knob_defs.py', 'gen_common.py',
'templates/gen_knobs.h',
),
)
# The generators above are needed individually, while the generators below
# are all inputs to the same lib, so they don't need unique names.
files_swr_common += [
gen_builder_hpp, gen_builder_meta_hpp, gen_knobs_h, gen_knobs_cpp
]
foreach x : [[swr_context_files, 'gen_swr_context_llvm.h'],
[swr_state_files, 'gen_state_llvm.h'],
[swr_surf_state_files, 'gen_surf_state_llvm.h']]
files_swr_common += custom_target(
x[1],
input : ['gen_llvm_types.py', x[0]],
output : x[1],
command : [prog_python, '@INPUT0@', '--input', '@INPUT1@', '--output', '@OUTPUT@'],
depend_files : files(
'templates/gen_llvm.hpp',
'gen_common.py',
),
)
endforeach
ar_output_filenames = ['gen_ar_event.hpp', 'gen_ar_event.cpp', 'gen_ar_eventhandler.hpp', 'gen_ar_eventhandlerfile.hpp']
ar_template_filenames = []
foreach fname : ar_output_filenames
ar_template_filenames += join_paths('templates', fname)
endforeach
files_swr_common += custom_target(
'gen_archrast',
input : ['gen_archrast.py', swr_event_proto_files, swr_event_pproto_files],
output : ar_output_filenames,
command : [prog_python, '@INPUT0@', '--proto', '@INPUT1@', '@INPUT2@', '--output-dir', meson.current_build_dir()],
depend_files : files('gen_common.py', ar_template_filenames)
)

View file

@ -1,55 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Implementation for events. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#include "common/os.h"
#include "gen_ar_event.hpp"
#include "gen_ar_eventhandler.hpp"
using namespace ArchRast;
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
void ${event['name']}::Accept(EventHandler* pHandler) const
{
pHandler->Handle(*this);
}
% endfor
% endfor
// clang-format on

View file

@ -1,168 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Definitions for events. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include "common/os.h"
#include "core/state.h"
<%
always_enabled_knob_groups = ['Framework', 'SWTagFramework', 'ApiSwr']
group_knob_remap_table = {
"ShaderStats": "KNOB_AR_ENABLE_SHADER_STATS",
"PipelineStats" : "KNOB_AR_ENABLE_PIPELINE_STATS",
"SWTagData" : "KNOB_AR_ENABLE_SWTAG_DATA",
}
%>
namespace ArchRast
{
<% sorted_enums = sorted(protos['enums']['defs']) %>
% for name in sorted_enums:
enum ${name}
{<% names = protos['enums']['defs'][name]['names'] %>
% for i in range(len(names)):
${names[i].lstrip()}
% endfor
};
% endfor
// Forward decl
class EventHandler;
//////////////////////////////////////////////////////////////////////////
/// Event - interface for handling events.
//////////////////////////////////////////////////////////////////////////
struct Event
{
const uint32_t eventId = {0xFFFFFFFF};
Event() {}
virtual ~Event() {}
virtual bool IsEnabled() const { return true; };
virtual const uint32_t GetEventId() const = 0;
virtual void Accept(EventHandler* pHandler) const = 0;
};
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
//////////////////////////////////////////////////////////////////////////
/// ${event_key}Data
//////////////////////////////////////////////////////////////////////////
#pragma pack(push, 1)
struct ${event['name']}Data
{<%
fields = event['fields'] %>
// Fields
% for i in range(len(fields)):
% if fields[i]['size'] > 1:
${fields[i]['type']} ${fields[i]['name']}[${fields[i]['size']}];
% else:
${fields[i]['type']} ${fields[i]['name']};
% endif
% endfor
};
#pragma pack(pop)
//////////////////////////////////////////////////////////////////////////
/// ${event_key}
//////////////////////////////////////////////////////////////////////////
struct ${event['name']} : Event
{<%
fields = event['fields'] %>
const uint32_t eventId = {${ event['id'] }};
${event['name']}Data data;
// Constructor
${event['name']}(
% for i in range(len(fields)):
% if i < len(fields)-1:
% if fields[i]['size'] > 1:
${fields[i]['type']}* ${fields[i]['name']},
uint32_t ${fields[i]['name']}_size,
% else:
${fields[i]['type']} ${fields[i]['name']},
% endif
% endif
% if i == len(fields)-1:
% if fields[i]['size'] > 1:
${fields[i]['type']}* ${fields[i]['name']},
uint32_t ${fields[i]['name']}_size
% else:
${fields[i]['type']} ${fields[i]['name']}
% endif
% endif
% endfor
)
{
% for i in range(len(fields)):
% if fields[i]['size'] > 1:
% if fields[i]['type'] == 'char':
// Copy size of string (null-terminated) followed by string into entire buffer
SWR_ASSERT(${fields[i]['name']}_size + 1 < ${fields[i]['size']} - sizeof(uint32_t), "String length must be less than size of char buffer - size(uint32_t)!");
memcpy(data.${fields[i]['name']}, &${fields[i]['name']}_size, sizeof(uint32_t));
strcpy_s(data.${fields[i]['name']} + sizeof(uint32_t), ${fields[i]['name']}_size + 1, ${fields[i]['name']});
% else:
memcpy(data.${fields[i]['name']}, ${fields[i]['name']}, ${fields[i]['name']}_size);
% endif
% else:
data.${fields[i]['name']} = ${fields[i]['name']};
% endif
% endfor
}
virtual void Accept(EventHandler* pHandler) const;
inline const uint32_t GetEventId() const { return eventId; }
% if group not in always_enabled_knob_groups:
<%
if group in group_knob_remap_table:
group_knob_define = group_knob_remap_table[group]
else:
group_knob_define = 'KNOB_AR_ENABLE_' + group.upper() + '_EVENTS'
%>
bool IsEnabled() const
{
static const bool IsEventEnabled = true; // TODO: Replace with knob for each event
return ${group_knob_define} && IsEventEnabled;
}
% endif
};
% endfor
% endfor
} // namespace ArchRast
// clang-format on
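As a concrete illustration of what this template expands to, consider a hypothetical single-field event (the name, id, and field are invented for the example; an event in an always-enabled group like ApiSwr gets no IsEnabled override):
#pragma pack(push, 1)
struct DrawInfoEventData
{
    // Fields
    uint32_t drawId;
};
#pragma pack(pop)
struct DrawInfoEvent : Event
{
    const uint32_t eventId = {42};
    DrawInfoEventData data;
    // Constructor
    DrawInfoEvent(
        uint32_t drawId
    )
    {
        data.drawId = drawId;
    }
    virtual void Accept(EventHandler* pHandler) const;
    inline const uint32_t GetEventId() const { return eventId; }
};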

View file

@ -1,61 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Event handler interface. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include "${event_header}"
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// EventHandler - interface for handling events.
//////////////////////////////////////////////////////////////////////////
class EventHandler
{
public:
EventHandler() {}
virtual ~EventHandler() {}
virtual void FlushDraw(uint32_t drawId) {}
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%> virtual void Handle(const ${event['name']}& event) {}
% endfor
% endfor
};
} // namespace ArchRast
// clang-format on

View file

@ -1,174 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief Event handler interface. auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include "common/os.h"
#include "${event_header}"
#include <fstream>
#include <sstream>
#include <iostream>
#include <thread>
namespace ArchRast
{
//////////////////////////////////////////////////////////////////////////
/// EventHandlerFile - interface for handling events.
//////////////////////////////////////////////////////////////////////////
class EventHandlerFile : public EventHandler
{
public:
EventHandlerFile(uint32_t id) : mBufOffset(0)
{
#if defined(_WIN32)
DWORD pid = GetCurrentProcessId();
TCHAR procname[MAX_PATH];
GetModuleFileName(NULL, procname, MAX_PATH);
const char* pBaseName = strrchr(procname, '\\');
std::stringstream outDir;
outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
mOutputDir = outDir.str();
if (CreateDirectory(mOutputDir.c_str(), NULL))
{
std::cout << std::endl
<< "ArchRast Dir: " << mOutputDir << std::endl
<< std::endl
<< std::flush;
}
// There could be multiple threads creating thread pools. We
// want to make sure they are uniquely identified by adding in
// the creator's thread id into the filename.
std::stringstream fstr;
fstr << outDir.str().c_str() << "\\ar_event" << std::this_thread::get_id();
fstr << "_" << id << ".bin" << std::ends;
mFilename = fstr.str();
#else
// There could be multiple threads creating thread pools. We
// want to make sure they are uniquely identified by adding in
// the creator's thread id into the filename.
std::stringstream fstr;
fstr << "/tmp/ar_event" << std::this_thread::get_id();
fstr << "_" << id << ".bin" << std::ends;
mFilename = fstr.str();
#endif
}
virtual ~EventHandlerFile() { FlushBuffer(); }
//////////////////////////////////////////////////////////////////////////
/// @brief Flush buffer to file.
bool FlushBuffer()
{
if (mBufOffset > 0)
{
if (mBufOffset == mHeaderBufOffset)
{
// Nothing to flush. Only the header has been generated.
return false;
}
std::ofstream file;
file.open(mFilename, std::ios::out | std::ios::app | std::ios::binary);
if (!file.is_open())
{
SWR_INVALID("ArchRast: Could not open event file!");
return false;
}
file.write((char*)mBuffer, mBufOffset);
file.close();
mBufOffset = 0;
mHeaderBufOffset = 0; // Reset header offset so it's no longer considered.
}
return true;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Write event and its payload to the memory buffer.
void Write(uint32_t eventId, const char* pBlock, uint32_t size)
{
if ((mBufOffset + size + sizeof(eventId)) > mBufferSize)
{
if (!FlushBuffer())
{
// Don't corrupt what's already in the buffer?
/// @todo Maybe add corrupt marker to buffer here in case we can open file in
/// future?
return;
}
}
memcpy(&mBuffer[mBufOffset], (char*)&eventId, sizeof(eventId));
mBufOffset += sizeof(eventId);
memcpy(&mBuffer[mBufOffset], pBlock, size);
mBufOffset += size;
}
<% sorted_groups = sorted(protos['events']['groups']) %>
% for group in sorted_groups:
% for event_key in protos['events']['groups'][group]:
<%
event = protos['events']['defs'][event_key]
%>
//////////////////////////////////////////////////////////////////////////
/// @brief Handle ${event_key} event
virtual void Handle(const ${event['name']}& event)
{
% if event['num_fields'] == 0:
Write(event.eventId, (char*)&event.data, 0);
% else:
Write(event.eventId, (char*)&event.data, sizeof(event.data));
% endif
}
% endfor
% endfor
//////////////////////////////////////////////////////////////////////////
/// @brief Everything written to the buffer up to this point is the header.
virtual void MarkHeader()
{
mHeaderBufOffset = mBufOffset;
}
std::string mFilename;
std::string mOutputDir;
static const uint32_t mBufferSize = 1024;
uint8_t mBuffer[mBufferSize];
uint32_t mBufOffset{0};
uint32_t mHeaderBufOffset{0};
};
} // namespace ArchRast
// clang-format on

View file

@ -1,42 +0,0 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file BackendPixelRate${fileNum}.cpp
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
#include "core/backend.h"
#include "core/backend_impl.h"
void InitBackendPixelRate${fileNum}()
{
%for func in funcList:
${func}
%endfor
}

View file

@ -1,84 +0,0 @@
//============================================================================
// Copyright (C) 2014-2020 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file ${filename}
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
// clang-format off
#pragma once
//============================================================================
// Auto-generated ${comment}
//============================================================================
%for func in functions:
<%argList = ', '.join(func['args'])%>\
${func['decl']}
{
%if isX86:
%if len(func['args']) != 0:
SmallVector<Type*, ${len(func['args'])}> argTypes;
%for arg in func['args']:
argTypes.push_back(${arg}->getType());
%endfor
#if LLVM_VERSION_MAJOR >= 12
#define VEC_GET_NUM_ELEMS cast<FixedVectorType>(a->getType())->getNumElements()
#elif LLVM_VERSION_MAJOR >= 11
#define VEC_GET_NUM_ELEMS cast<VectorType>(a->getType())->getNumElements()
#else
#define VEC_GET_NUM_ELEMS a->getType()->getVectorNumElements()
#endif
FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, argTypes, false);
%else:
FunctionType* pFuncTy = FunctionType::get(${ func['returnType'] }, {}, false);
%endif
#if LLVM_VERSION_MAJOR >= 9
Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy).getCallee());
#else
Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("meta.intrinsic.${func['name']}", pFuncTy));
#endif
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%elif isIntrin:
%if len(func['types']) != 0:
SmallVector<Type*, ${len(func['types'])}> args;
%for arg in func['types']:
args.push_back(${arg}->getType());
%endfor
Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%else:
Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
%endif
%else:
return IRB()->${func['intrin']}(${argList});
%endif
}
% endfor
// clang-format on
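For reference, a hypothetical isIntrin entry with one overloaded argument type would expand to roughly the following method (the VCTLZ name and the ctlz intrinsic are chosen for illustration; JM() and CALL are the builder's own helpers referenced by the template):
Value* Builder::VCTLZ(Value* a, Value* isZeroUndef, const llvm::Twine& name)
{
    SmallVector<Type*, 1> args;
    args.push_back(a->getType());
    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctlz, args);
    return CALL(pFunc, std::initializer_list<Value*>{a, isZeroUndef}, name);
}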

View file

@ -1,46 +0,0 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file ${filename}
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
// clang-format off
%for num in range(numFiles):
void Init${tableName}${num}();
%endfor
static INLINE void Init${tableName}()
{
%for num in range(numFiles):
Init${tableName}${num}();
%endfor
}
// clang-format on
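Instantiated with, say, tableName='RasterizerFuncs' and numFiles=2 (the file count is illustrative; the per-file definitions come from the gen_rasterizer template below), this expands to:
void InitRasterizerFuncs0();
void InitRasterizerFuncs1();
static INLINE void InitRasterizerFuncs()
{
    InitRasterizerFuncs0();
    InitRasterizerFuncs1();
}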

View file

@ -1,143 +0,0 @@
/******************************************************************************
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}.cpp
*
* @brief Dynamic Knobs for Core.
*
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
<% calc_max_knob_len(knobs) %>
% for inc in includes:
#include <${inc}>
% endfor
#include <regex>
#include <core/utils.h>
//========================================================
// Implementation
//========================================================
void KnobBase::autoExpandEnvironmentVariables(std::string& text)
{
size_t start;
while ((start = text.find("${'${'}")) != std::string::npos)
{
size_t end = text.find("}");
if (end == std::string::npos)
break;
const std::string var = GetEnv(text.substr(start + 2, end - start - 2));
text.replace(start, end - start + 1, var);
}
// win32 style variable replacement
while ((start = text.find("%")) != std::string::npos)
{
size_t end = text.find("%", start + 1);
if (end == std::string::npos)
break;
const std::string var = GetEnv(text.substr(start + 1, end - start - 1));
text.replace(start, end - start + 1, var);
}
}
//========================================================
// Static Data Members
//========================================================
% for knob in knobs:
% if knob[1]['type'] == 'std::string':
${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = "${repr(knob[1]['default'])[1:-1]}";
% else:
${knob[1]['type']} GlobalKnobs::Knob_${knob[0]}::m_default = ${knob[1]['default']};
% endif
% endfor
GlobalKnobs g_GlobalKnobs;
//========================================================
// Knob Initialization
//========================================================
GlobalKnobs::GlobalKnobs()
{
% for knob in knobs :
InitKnob(${ knob[0] });
% endfor
}
//========================================================
// Knob Display (Convert to String)
//========================================================
std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
{
std::basic_stringstream<char> str;
str << std::showbase << std::setprecision(1) << std::fixed;
if (optPerLinePrefix == nullptr)
{
optPerLinePrefix = "";
}
% for knob in knobs:
str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
% if knob[1]['type'] == 'bool':
str << (KNOB_${knob[0]} ? "+\n" : "-\n");
% elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
str << std::dec << KNOB_${knob[0]} << "\n";
% else:
str << KNOB_${knob[0]} << "\n";
% endif
% endfor
str << std::ends;
return str.str();
}
<%!
# Globally available python
max_len = 0
def calc_max_knob_len(knobs):
global max_len
max_len = 0
for knob in knobs:
if len(knob[0]) > max_len: max_len = len(knob[0])
max_len += len('KNOB_ ')
if max_len % 4: max_len += 4 - (max_len % 4)
def space_knob(knob):
knob_len = len('KNOB_' + knob)
return ' '*(max_len - knob_len)
def calc_max_name_len(choices_array):
_max_len = 0
for choice in choices_array:
if len(choice['name']) > _max_len: _max_len = len(choice['name'])
if _max_len % 4: _max_len += 4 - (_max_len % 4)
return _max_len
def space_name(name, max_len):
name_len = len(name)
return ' '*(max_len - name_len)
%>
// clang-format on
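The two loops in autoExpandEnvironmentVariables handle the Unix-style ${VAR} and Windows-style %VAR% defaults seen in knob_defs.py. A minimal sketch of the observable effect, assuming HOME=/home/user and using the SET_KNOB / GET_KNOB accessors from the companion gen_knobs.h template below:
// Fragment, not driver code: assigning to a std::string knob routes the
// value through expandEnvironmentVariables().
SET_KNOB(JIT_CACHE_DIR, "${HOME}/.swr/jitcache");
assert(GET_KNOB(JIT_CACHE_DIR) == "/home/user/.swr/jitcache");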

View file

@ -1,154 +0,0 @@
/******************************************************************************
* Copyright (C) 2015-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}.h
*
* @brief Dynamic Knobs for Core.
*
* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
<% calc_max_knob_len(knobs) %>
#pragma once
#include <string>
struct KnobBase
{
private:
// Update the input string.
static void autoExpandEnvironmentVariables(std::string& text);
protected:
// Leave input alone and return new string.
static std::string expandEnvironmentVariables(std::string const& input)
{
std::string text = input;
autoExpandEnvironmentVariables(text);
return text;
}
template <typename T>
static T expandEnvironmentVariables(T const& input)
{
return input;
}
};
template <typename T>
struct Knob : KnobBase
{
public:
const T& Value() const { return m_Value; }
const T& Value(T const& newValue)
{
m_Value = expandEnvironmentVariables(newValue);
return Value();
}
private:
T m_Value;
};
#define DEFINE_KNOB(_name, _type) \\
struct Knob_##_name : Knob<_type> \\
{ \\
static const char* Name() { return "KNOB_" #_name; } \\
static _type DefaultValue() { return (m_default); } \\
private: \\
static _type m_default; \\
} _name;
#define GET_KNOB(_name) g_GlobalKnobs._name.Value()
#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue)
struct GlobalKnobs
{
% for knob in knobs:
//-----------------------------------------------------------
// KNOB_${knob[0]}
//
% for line in knob[1]['desc']:
// ${line}
% endfor
% if knob[1].get('choices'):
<%
choices = knob[1].get('choices')
_max_len = calc_max_name_len(choices) %>//
% for i in range(len(choices)):
// ${choices[i]['name']}${space_name(choices[i]['name'], _max_len)} = ${format(choices[i]['value'], '#010x')}
% endfor
% endif
//
DEFINE_KNOB(${knob[0]}, ${knob[1]['type']});
% endfor
std::string ToString(const char* optPerLinePrefix="");
GlobalKnobs();
};
extern GlobalKnobs g_GlobalKnobs;
#undef DEFINE_KNOB
% for knob in knobs:
#define KNOB_${knob[0]}${space_knob(knob[0])} GET_KNOB(${knob[0]})
% endfor
<%!
# Globally available python
max_len = 0
def calc_max_knob_len(knobs):
global max_len
max_len = 0
for knob in knobs:
if len(knob[0]) > max_len: max_len = len(knob[0])
max_len += len('KNOB_ ')
if max_len % 4: max_len += 4 - (max_len % 4)
def space_knob(knob):
knob_len = len('KNOB_' + knob)
return ' '*(max_len - knob_len)
def calc_max_name_len(choices_array):
_max_len = 0
for choice in choices_array:
if len(choice['name']) > _max_len: _max_len = len(choice['name'])
if _max_len % 4: _max_len += 4 - (_max_len % 4)
return _max_len
def space_name(name, max_len):
name_len = len(name)
return ' '*(max_len - name_len)
%>
// clang-format on
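A minimal usage sketch (the function is invented for illustration; the macros are the generated ones defined above):
#include "gen_knobs.h"
void ProcessDraw()
{
    // KNOB_TOSS_DRAW expands to g_GlobalKnobs.TOSS_DRAW.Value()
    if (KNOB_TOSS_DRAW)
    {
        return; // toss the draw entirely
    }
    // ... normal draw processing ...
}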

View file

@ -1,109 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file ${filename}
*
* @brief auto-generated file
*
* DO NOT EDIT
*
* Generation Command Line:
* ${'\n * '.join(cmdline)}
*
******************************************************************************/
// clang-format off
#pragma once
#include <llvm/IR/DerivedTypes.h>
namespace SwrJit
{
using namespace llvm;
%for type in types:
INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr)
{
%if needs_ctx(type):
LLVMContext& ctx = pJitMgr->mContext;
%endif
#if LLVM_VERSION_MAJOR >= 12
StructType* pRetType = StructType::getTypeByName(pJitMgr->mContext, "${type['name']}");
#else
StructType* pRetType = pJitMgr->mpCurrentModule->getTypeByName("${type['name']}");
#endif
if (pRetType == nullptr)
{
std::vector<Type*> members =<% (max_type_len, max_name_len) = calc_max_len(type['members']) %>
{
%for member in type['members']:
/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ ${member['type']},
%endfor
};
pRetType = StructType::create(members, "${type['name']}", false);
// Compute debug metadata
llvm::DIBuilder builder(*pJitMgr->mpCurrentModule);
llvm::DIFile* pFile = builder.createFile("${input_file}", "${os.path.normpath(input_dir).replace('\\', '/')}");
std::vector<std::pair<std::string, uint32_t>> dbgMembers =
{
%for member in type['members']:
std::make_pair("${member['name']}", ${pad(len(member['name']), max_name_len)}${member['lineNum']}),
%endfor
};
pJitMgr->CreateDebugStructType(pRetType, "${type['name']}", pFile, ${type['lineNum']}, dbgMembers);
}
return pRetType;
}
%for member in type['members']:
static const uint32_t ${type['name']}_${member['name']} ${pad(len(member['name']), max_name_len)}= ${loop.index};
%endfor
%endfor
} // namespace SwrJit
<%! # Global function definitions
import os
def needs_ctx(struct_type):
for m in struct_type.get('members', []):
if '(ctx)' in m.get('type', ''):
return True
return False
def calc_max_len(fields):
max_type_len = 0
max_name_len = 0
for f in fields:
if len(f['type']) > max_type_len: max_type_len = len(f['type'])
if len(f['name']) > max_name_len: max_name_len = len(f['name'])
return (max_type_len, max_name_len)
def pad(cur_len, max_len):
pad_amt = max_len - cur_len
return ' '*pad_amt
%>
// clang-format on

View file

@ -1,44 +0,0 @@
//============================================================================
// Copyright (C) 2017 Intel Corporation. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice (including the next
// paragraph) shall be included in all copies or substantial portions of the
// Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// @file gen_rasterizer${fileNum}.cpp
//
// @brief auto-generated file
//
// DO NOT EDIT
//
// Generation Command Line:
// ${'\n// '.join(cmdline)}
//
//============================================================================
// clang-format off
#include "core/rasterizer.h"
#include "core/rasterizer_impl.h"
void InitRasterizerFuncs${fileNum}()
{
%for func in funcList:
${func}
%endfor
}
// clang-format on

File diff suppressed because it is too large

View file

@ -1,268 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file formats.h
*
* @brief auto-generated file
*
* DO NOT EDIT
*
******************************************************************************/
#pragma once
#include "common/os.h"
//////////////////////////////////////////////////////////////////////////
/// SWR_TYPE - Format component type
//////////////////////////////////////////////////////////////////////////
enum SWR_TYPE
{
SWR_TYPE_UNKNOWN,
SWR_TYPE_UNUSED,
SWR_TYPE_UNORM,
SWR_TYPE_SNORM,
SWR_TYPE_UINT,
SWR_TYPE_SINT,
SWR_TYPE_FLOAT,
SWR_TYPE_SSCALED,
SWR_TYPE_USCALED,
SWR_TYPE_SFIXED,
};
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT
//////////////////////////////////////////////////////////////////////////
enum SWR_FORMAT
{
R32G32B32A32_FLOAT = 0x0,
R32G32B32A32_SINT = 0x1,
R32G32B32A32_UINT = 0x2,
R64G64_FLOAT = 0x5,
R32G32B32X32_FLOAT = 0x6,
R32G32B32A32_SSCALED = 0x7,
R32G32B32A32_USCALED = 0x8,
R32G32B32A32_SFIXED = 0x20,
R32G32B32_FLOAT = 0x40,
R32G32B32_SINT = 0x41,
R32G32B32_UINT = 0x42,
R32G32B32_SSCALED = 0x45,
R32G32B32_USCALED = 0x46,
R32G32B32_SFIXED = 0x50,
R16G16B16A16_UNORM = 0x80,
R16G16B16A16_SNORM = 0x81,
R16G16B16A16_SINT = 0x82,
R16G16B16A16_UINT = 0x83,
R16G16B16A16_FLOAT = 0x84,
R32G32_FLOAT = 0x85,
R32G32_SINT = 0x86,
R32G32_UINT = 0x87,
R32_FLOAT_X8X24_TYPELESS = 0x88,
X32_TYPELESS_G8X24_UINT = 0x89,
L32A32_FLOAT = 0x8A,
R64_FLOAT = 0x8D,
R16G16B16X16_UNORM = 0x8E,
R16G16B16X16_FLOAT = 0x8F,
L32X32_FLOAT = 0x91,
I32X32_FLOAT = 0x92,
R16G16B16A16_SSCALED = 0x93,
R16G16B16A16_USCALED = 0x94,
R32G32_SSCALED = 0x95,
R32G32_USCALED = 0x96,
R32G32_SFIXED = 0xA0,
B8G8R8A8_UNORM = 0xC0,
B8G8R8A8_UNORM_SRGB = 0xC1,
R10G10B10A2_UNORM = 0xC2,
R10G10B10A2_UNORM_SRGB = 0xC3,
R10G10B10A2_UINT = 0xC4,
R8G8B8A8_UNORM = 0xC7,
R8G8B8A8_UNORM_SRGB = 0xC8,
R8G8B8A8_SNORM = 0xC9,
R8G8B8A8_SINT = 0xCA,
R8G8B8A8_UINT = 0xCB,
R16G16_UNORM = 0xCC,
R16G16_SNORM = 0xCD,
R16G16_SINT = 0xCE,
R16G16_UINT = 0xCF,
R16G16_FLOAT = 0xD0,
B10G10R10A2_UNORM = 0xD1,
B10G10R10A2_UNORM_SRGB = 0xD2,
R11G11B10_FLOAT = 0xD3,
R10G10B10_FLOAT_A2_UNORM = 0xD5,
R32_SINT = 0xD6,
R32_UINT = 0xD7,
R32_FLOAT = 0xD8,
R24_UNORM_X8_TYPELESS = 0xD9,
X24_TYPELESS_G8_UINT = 0xDA,
L32_UNORM = 0xDD,
L16A16_UNORM = 0xDF,
I24X8_UNORM = 0xE0,
L24X8_UNORM = 0xE1,
I32_FLOAT = 0xE3,
L32_FLOAT = 0xE4,
A32_FLOAT = 0xE5,
B8G8R8X8_UNORM = 0xE9,
B8G8R8X8_UNORM_SRGB = 0xEA,
R8G8B8X8_UNORM = 0xEB,
R8G8B8X8_UNORM_SRGB = 0xEC,
R9G9B9E5_SHAREDEXP = 0xED,
B10G10R10X2_UNORM = 0xEE,
L16A16_FLOAT = 0xF0,
R10G10B10X2_USCALED = 0xF3,
R8G8B8A8_SSCALED = 0xF4,
R8G8B8A8_USCALED = 0xF5,
R16G16_SSCALED = 0xF6,
R16G16_USCALED = 0xF7,
R32_SSCALED = 0xF8,
R32_USCALED = 0xF9,
B5G6R5_UNORM = 0x100,
B5G6R5_UNORM_SRGB = 0x101,
B5G5R5A1_UNORM = 0x102,
B5G5R5A1_UNORM_SRGB = 0x103,
B4G4R4A4_UNORM = 0x104,
B4G4R4A4_UNORM_SRGB = 0x105,
R8G8_UNORM = 0x106,
R8G8_SNORM = 0x107,
R8G8_SINT = 0x108,
R8G8_UINT = 0x109,
R16_UNORM = 0x10A,
R16_SNORM = 0x10B,
R16_SINT = 0x10C,
R16_UINT = 0x10D,
R16_FLOAT = 0x10E,
I16_UNORM = 0x111,
L16_UNORM = 0x112,
A16_UNORM = 0x113,
L8A8_UNORM = 0x114,
I16_FLOAT = 0x115,
L16_FLOAT = 0x116,
A16_FLOAT = 0x117,
L8A8_UNORM_SRGB = 0x118,
B5G5R5X1_UNORM = 0x11A,
B5G5R5X1_UNORM_SRGB = 0x11B,
R8G8_SSCALED = 0x11C,
R8G8_USCALED = 0x11D,
R16_SSCALED = 0x11E,
R16_USCALED = 0x11F,
A1B5G5R5_UNORM = 0x124,
A4B4G4R4_UNORM = 0x125,
L8A8_UINT = 0x126,
L8A8_SINT = 0x127,
R8_UNORM = 0x140,
R8_SNORM = 0x141,
R8_SINT = 0x142,
R8_UINT = 0x143,
A8_UNORM = 0x144,
I8_UNORM = 0x145,
L8_UNORM = 0x146,
R8_SSCALED = 0x149,
R8_USCALED = 0x14A,
L8_UNORM_SRGB = 0x14C,
L8_UINT = 0x152,
L8_SINT = 0x153,
I8_UINT = 0x154,
I8_SINT = 0x155,
DXT1_RGB_SRGB = 0x180,
YCRCB_SWAPUVY = 0x183,
BC1_UNORM = 0x186,
BC2_UNORM = 0x187,
BC3_UNORM = 0x188,
BC4_UNORM = 0x189,
BC5_UNORM = 0x18A,
BC1_UNORM_SRGB = 0x18B,
BC2_UNORM_SRGB = 0x18C,
BC3_UNORM_SRGB = 0x18D,
YCRCB_SWAPUV = 0x18F,
DXT1_RGB = 0x191,
R8G8B8_UNORM = 0x193,
R8G8B8_SNORM = 0x194,
R8G8B8_SSCALED = 0x195,
R8G8B8_USCALED = 0x196,
R64G64B64A64_FLOAT = 0x197,
R64G64B64_FLOAT = 0x198,
BC4_SNORM = 0x199,
BC5_SNORM = 0x19A,
R16G16B16_FLOAT = 0x19B,
R16G16B16_UNORM = 0x19C,
R16G16B16_SNORM = 0x19D,
R16G16B16_SSCALED = 0x19E,
R16G16B16_USCALED = 0x19F,
BC6H_SF16 = 0x1A1,
BC7_UNORM = 0x1A2,
BC7_UNORM_SRGB = 0x1A3,
BC6H_UF16 = 0x1A4,
R8G8B8_UNORM_SRGB = 0x1A8,
R16G16B16_UINT = 0x1B0,
R16G16B16_SINT = 0x1B1,
R32_SFIXED = 0x1B2,
R10G10B10A2_SNORM = 0x1B3,
R10G10B10A2_USCALED = 0x1B4,
R10G10B10A2_SSCALED = 0x1B5,
R10G10B10A2_SINT = 0x1B6,
B10G10R10A2_SNORM = 0x1B7,
B10G10R10A2_USCALED = 0x1B8,
B10G10R10A2_SSCALED = 0x1B9,
B10G10R10A2_UINT = 0x1BA,
B10G10R10A2_SINT = 0x1BB,
R8G8B8_UINT = 0x1C8,
R8G8B8_SINT = 0x1C9,
RAW = 0x1FF,
NUM_SWR_FORMATS = 0x200,
};
//////////////////////////////////////////////////////////////////////////
/// SWR_FORMAT_INFO - Format information
//////////////////////////////////////////////////////////////////////////
struct SWR_FORMAT_INFO
{
const char* name;
SWR_TYPE type[4];
uint32_t defaults[4];
uint32_t swizzle[4]; ///< swizzle per component
uint32_t bpc[4]; ///< bits per component
uint32_t bpp; ///< bits per pixel
uint32_t Bpp; ///< bytes per pixel
uint32_t numComps; ///< number of components
bool isSRGB;
bool isBC;
bool isSubsampled;
bool isLuminance;
bool isNormalized[4];
float toFloat[4];
uint32_t bcWidth;
uint32_t bcHeight;
};
extern const SWR_FORMAT_INFO gFormatInfo[NUM_SWR_FORMATS];
//////////////////////////////////////////////////////////////////////////
/// @brief Retrieves format info struct for given format.
/// @param format - SWR format
INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format)
{
SWR_ASSERT(format < NUM_SWR_FORMATS, "Invalid Surface Format: %d", format);
SWR_ASSERT(gFormatInfo[format].name != nullptr, "Invalid Surface Format: %d", format);
return gFormatInfo[format];
}
// lookup table for unorm8 srgb -> float conversion
extern const uint32_t srgb8Table[256];
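A small illustration of the lookup helper (the RowSizeBytes function is invented for the example; it only holds for uncompressed formats, since BC formats are sized in bcWidth x bcHeight blocks):
#include "common/formats.h"
uint32_t RowSizeBytes(SWR_FORMAT fmt, uint32_t width)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(fmt);
    return width * info.Bpp; // Bpp = bytes per pixel
}
// e.g. RowSizeBytes(R8G8B8A8_UNORM, 256) == 1024: four 8-bit components,
// so 32 bits / 4 bytes per pixel.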

View file

@ -1,120 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_INTRIN_H__
#define __SWR_INTRIN_H__
#include "os.h"
#if !defined(SIMD_ARCH)
#define SIMD_ARCH KNOB_ARCH
#endif
#include "simdlib_types.hpp"
typedef SIMDImpl::SIMD128Impl::Float simd4scalar;
typedef SIMDImpl::SIMD128Impl::Double simd4scalard;
typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector;
typedef SIMDImpl::SIMD128Impl::Mask simd4mask;
typedef SIMDImpl::SIMD256Impl::Float simd8scalar;
typedef SIMDImpl::SIMD256Impl::Double simd8scalard;
typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector;
typedef SIMDImpl::SIMD256Impl::Mask simd8mask;
typedef SIMDImpl::SIMD512Impl::Float simd16scalar;
typedef SIMDImpl::SIMD512Impl::Double simd16scalard;
typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector;
typedef SIMDImpl::SIMD512Impl::Mask simd16mask;
#if KNOB_SIMD_WIDTH == 8
typedef simd8scalar simdscalar;
typedef simd8scalard simdscalard;
typedef simd8scalari simdscalari;
typedef simd8vector simdvector;
typedef simd8mask simdmask;
#else
#error Unsupported vector width
#endif
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
return _pdep_u32(a, mask);
#else
UINT result = 0;
// copied from http://wm.ite.pl/articles/pdep-soft-emu.html
// using bsf instead of funky loop
unsigned long maskIndex = 0;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
const UINT lowest = 1 << maskIndex;
// 2. populate LSB from src
const UINT LSB = (UINT)((int)(a << 31) >> 31);
// 3. copy bit from mask
result |= LSB & lowest;
// 4. clear lowest bit
mask &= ~lowest;
// 5. prepare for next iteration
a >>= 1;
}
return result;
#endif
}
INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
return _pext_u32(a, mask);
#else
UINT result = 0;
unsigned long maskIndex;
uint32_t currentBit = 0;
while (_BitScanForward(&maskIndex, mask))
{
// 1. isolate lowest set bit of mask
const UINT lowest = 1 << maskIndex;
// 2. copy bit from mask
result |= ((a & lowest) > 0) << currentBit++;
// 3. clear lowest bit
mask &= ~lowest;
}
return result;
#endif
}
#endif //__SWR_INTRIN_H__
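A worked example of the deposit/extract pair above (standalone sketch; on AVX2 builds these compile straight to the PDEP/PEXT instructions):
#include "common/intrin.h"
#include <cassert>
int main()
{
    // mask 0b11010 has bits 1, 3 and 4 set; pdep scatters the low bits of
    // 'a' into those positions, and pext gathers them back out.
    assert(pdep_u32(0b101u, 0b11010u) == 0b10010u); // 5 -> 18
    assert(pext_u32(0b10010u, 0b11010u) == 0b101u); // 18 -> 5
    return 0;
}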

View file

@ -1,231 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include <iostream>
#include <vector>
#include <bitset>
#include <array>
#include <string>
#include <algorithm>
// Clang for Windows does supply an intrin.h with __cpuid intrinsics; however,
// it seems not to realize that a write to "b" (ebx) will clobber the value in rbx.
// This attempts to use the "native" clang / gcc intrinsics instead of the
// Windows-compatible ones.
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#else
#include <string.h>
#if !defined(__cpuid)
#include <cpuid.h>
#endif
#endif
class InstructionSet
{
public:
InstructionSet() : CPU_Rep(){};
// getters
std::string Vendor(void) { return CPU_Rep.vendor_; }
std::string Brand(void) { return CPU_Rep.brand_; }
bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; }
bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; }
bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; }
bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; }
bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; }
bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; }
bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; }
bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; }
bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; }
bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; }
bool AES(void) { return CPU_Rep.f_1_ECX_[25]; }
bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; }
bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; }
bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; }
bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; }
bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; }
bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; }
bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; }
bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; }
bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; }
bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; }
bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; }
bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; }
bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; }
bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; }
bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; }
bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; }
bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; }
bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; }
bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; }
bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; }
bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; }
bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; }
bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; }
bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; }
bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; }
bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; }
bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; }
bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; }
bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; }
bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; }
bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; }
bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; }
bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; }
bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; }
bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; }
bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; }
bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; }
bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; }
bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; }
bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; }
bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; }
private:
class InstructionSet_Internal
{
public:
InstructionSet_Internal() :
nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0},
f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{}
{
// int cpuInfo[4] = {-1};
std::array<int, 4> cpui;
// Calling __cpuid with 0x0 as the function_id argument
// gets the number of the highest valid function ID.
#if defined(_MSC_VER) && !defined(__clang__)
__cpuid(cpui.data(), 0);
nIds_ = cpui[0];
#else
nIds_ = __get_cpuid_max(0, NULL);
#endif
for (int i = 0; i <= nIds_; ++i)
{
#if defined(_MSC_VER) && !defined(__clang__)
__cpuidex(cpui.data(), i, 0);
#else
int* data = cpui.data();
__cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
data_.push_back(cpui);
}
// Capture vendor string
char vendor[0x20];
memset(vendor, 0, sizeof(vendor));
*reinterpret_cast<int*>(vendor) = data_[0][1];
*reinterpret_cast<int*>(vendor + 4) = data_[0][3];
*reinterpret_cast<int*>(vendor + 8) = data_[0][2];
vendor_ = vendor;
if (vendor_ == "GenuineIntel")
{
isIntel_ = true;
}
else if (vendor_ == "AuthenticAMD")
{
isAMD_ = true;
}
// load bitset with flags for function 0x00000001
if (nIds_ >= 1)
{
f_1_ECX_ = data_[1][2];
f_1_EDX_ = data_[1][3];
}
// load bitset with flags for function 0x00000007
if (nIds_ >= 7)
{
f_7_EBX_ = data_[7][1];
f_7_ECX_ = data_[7][2];
}
// Calling __cpuid with 0x80000000 as the function_id argument
// gets the number of the highest valid extended ID.
#if defined(_MSC_VER) && !defined(__clang__)
__cpuid(cpui.data(), 0x80000000);
nExIds_ = cpui[0];
#else
nExIds_ = __get_cpuid_max(0x80000000, NULL);
#endif
char brand[0x40];
memset(brand, 0, sizeof(brand));
for (unsigned i = 0x80000000; i <= nExIds_; ++i)
{
#if defined(_MSC_VER) && !defined(__clang__)
__cpuidex(cpui.data(), i, 0);
#else
int* data = cpui.data();
__cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
#endif
extdata_.push_back(cpui);
}
// load bitset with flags for function 0x80000001
if (nExIds_ >= 0x80000001)
{
f_81_ECX_ = extdata_[1][2];
f_81_EDX_ = extdata_[1][3];
}
// Interpret CPU brand string if reported
if (nExIds_ >= 0x80000004)
{
memcpy(brand, extdata_[2].data(), sizeof(cpui));
memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
brand_ = brand;
}
};
int nIds_;
unsigned nExIds_;
std::string vendor_;
std::string brand_;
bool isIntel_;
bool isAMD_;
std::bitset<32> f_1_ECX_;
std::bitset<32> f_1_EDX_;
std::bitset<32> f_7_EBX_;
std::bitset<32> f_7_ECX_;
std::bitset<32> f_81_ECX_;
std::bitset<32> f_81_EDX_;
std::vector<std::array<int, 4>> data_;
std::vector<std::array<int, 4>> extdata_;
};
const InstructionSet_Internal CPU_Rep;
};
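Typical usage is a single instance queried for feature bits (a minimal sketch, not taken from the driver):
#include <iostream>
int main()
{
    InstructionSet cpu;
    std::cout << cpu.Vendor() << " / " << cpu.Brand() << "\n";
    if (cpu.AVX2())
        std::cout << "AVX2 supported\n";
    return 0;
}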

View file

@ -1,314 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "common/os.h"
#include <vector>
#include <array>
#include <sstream>
#if defined(_WIN32)
#include <shlobj.h>
#endif // Windows
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
#include <pthread.h>
#endif // Linux
#if defined(_MSC_VER)
static const DWORD MS_VC_EXCEPTION = 0x406D1388;
#pragma pack(push, 8)
typedef struct tagTHREADNAME_INFO
{
DWORD dwType; // Must be 0x1000.
LPCSTR szName; // Pointer to name (in user addr space).
DWORD dwThreadID; // Thread ID (-1=caller thread).
DWORD dwFlags; // Reserved for future use, must be zero.
} THREADNAME_INFO;
#pragma pack(pop)
void LegacySetThreadName(const char* pThreadName)
{
THREADNAME_INFO info;
info.dwType = 0x1000;
info.szName = pThreadName;
info.dwThreadID = GetCurrentThreadId();
info.dwFlags = 0;
if (!IsDebuggerPresent())
{
// No debugger attached to interpret the exception; no need to raise it
return;
}
#pragma warning(push)
#pragma warning(disable : 6320 6322)
__try
{
RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
}
#pragma warning(pop)
}
#endif // _WIN32
void SWR_API SetCurrentThreadName(const char* pThreadName)
{
#if defined(_MSC_VER)
// The SetThreadDescription API was introduced in version 1607 of Windows 10.
typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
// The SetThreadDescription API works even if no debugger is attached.
auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
if (!pfnSetThreadDescription)
{
// try KernelBase.dll
pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
}
if (pfnSetThreadDescription)
{
std::string utf8Name = pThreadName;
std::wstring wideName;
wideName.resize(utf8Name.size() + 1);
swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str());
HRESULT hr = pfnSetThreadDescription(GetCurrentThread(), wideName.c_str());
SWR_ASSERT(SUCCEEDED(hr), "Failed to set thread name to %s", pThreadName);
// Fall through - some debuggers only recognize the exception-based method below
}
// Fall back to the exception-based hack
LegacySetThreadName(pThreadName);
#endif // _WIN32
#if defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
pthread_setname_np(pthread_self(), pThreadName);
#endif // Linux
}
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
static void
SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
{
out_segments.clear();
std::istringstream f(input);
std::string s;
while (std::getline(f, s, splitToken))
{
if (s.size())
{
out_segments.push_back(s);
}
}
}
#endif // Unix
void SWR_API CreateDirectoryPath(const std::string& path)
{
#if defined(_WIN32)
SHCreateDirectoryExA(nullptr, path.c_str(), nullptr);
#endif // Windows
#if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
std::vector<std::string> pathSegments;
SplitString(pathSegments, path, '/');
std::string tmpPath;
for (auto const& segment : pathSegments)
{
tmpPath.push_back('/');
tmpPath += segment;
int result = mkdir(tmpPath.c_str(), 0777);
if (result == -1 && errno != EEXIST)
{
break;
}
}
#endif // Unix
}
/// Execute Command (block until finished)
/// @returns process exit value
int SWR_API ExecCmd(const std::string& cmd, ///< (In) Command line string
const char* pOptEnvStrings, ///< (Optional In) Environment block for new process
std::string* pOptStdOut, ///< (Optional Out) Standard Output text
std::string* pOptStdErr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn) ///< (Optional In) Standard Input text
{
int rvalue = -1;
#if defined(_WIN32)
struct WinPipe
{
HANDLE hRead;
HANDLE hWrite;
};
std::array<WinPipe, 3> hPipes = {};
SECURITY_ATTRIBUTES saAttr = {sizeof(SECURITY_ATTRIBUTES)};
saAttr.bInheritHandle = TRUE; // Pipe handles are inherited by child process.
saAttr.lpSecurityDescriptor = NULL;
{
bool bFail = false;
for (WinPipe& p : hPipes)
{
if (!CreatePipe(&p.hRead, &p.hWrite, &saAttr, 0))
{
bFail = true;
}
}
if (bFail)
{
for (WinPipe& p : hPipes)
{
CloseHandle(p.hRead);
CloseHandle(p.hWrite);
}
return rvalue;
}
}
STARTUPINFOA StartupInfo{};
StartupInfo.cb = sizeof(STARTUPINFOA);
StartupInfo.dwFlags = STARTF_USESTDHANDLES;
StartupInfo.dwFlags |= STARTF_USESHOWWINDOW;
StartupInfo.wShowWindow = SW_HIDE;
if (pOptStdIn)
{
StartupInfo.hStdInput = hPipes[0].hRead;
}
StartupInfo.hStdOutput = hPipes[1].hWrite;
StartupInfo.hStdError = hPipes[2].hWrite;
PROCESS_INFORMATION procInfo{};
// CreateProcess can modify the string
std::string local_cmd = cmd;
BOOL ProcessValue = CreateProcessA(NULL,
(LPSTR)local_cmd.c_str(),
NULL,
NULL,
TRUE,
0,
(LPVOID)pOptEnvStrings,
NULL,
&StartupInfo,
&procInfo);
if (ProcessValue && procInfo.hProcess)
{
auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) {
char buf[1024];
DWORD dwRead = 0;
DWORD dwAvail = 0;
while (true)
{
if (!::PeekNamedPipe(hPipe, NULL, 0, NULL, &dwAvail, NULL))
{
break;
}
if (!dwAvail) // no data available, return
{
break;
}
if (!::ReadFile(hPipe,
buf,
std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)),
&dwRead,
NULL) ||
!dwRead)
{
// error - the child process might have ended
break;
}
buf[dwRead] = 0;
if (pOutStr)
{
(*pOutStr) += buf;
}
}
};
bool bProcessEnded = false;
size_t bytesWritten = 0;
do
{
if (pOptStdIn && (pOptStdIn->size() > bytesWritten))
{
DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten;
if (!::WriteFile(hPipes[0].hWrite,
pOptStdIn->data() + bytesWritten,
bytesToWrite,
&bytesToWrite,
nullptr))
{
// Failed to write to pipe
break;
}
bytesWritten += bytesToWrite;
}
// Yield a small timeslice (50ms) so we don't spin at 100% CPU.
bProcessEnded = (WaitForSingleObject(procInfo.hProcess, 50) == WAIT_OBJECT_0);
ReadFromPipe(hPipes[1].hRead, pOptStdOut);
ReadFromPipe(hPipes[2].hRead, pOptStdErr);
} while (!bProcessEnded);
DWORD exitVal = 0;
if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
{
exitVal = 1;
}
CloseHandle(procInfo.hProcess);
CloseHandle(procInfo.hThread);
rvalue = exitVal;
}
for (WinPipe& p : hPipes)
{
CloseHandle(p.hRead);
CloseHandle(p.hWrite);
}
#else
// Non-Windows implementation not provided; rvalue stays -1
#endif
return rvalue;
}
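A hedged usage sketch for ExecCmd (the non-Windows branch above is a stub that leaves rvalue at -1, so meaningful output requires Windows; the command string is illustrative):
std::string out, err;
int exitCode = ExecCmd("cmd /c echo hello", nullptr, &out, &err); // illustrative command
if (exitCode == 0)
    printf("child wrote: %s", out.c_str());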

View file

@ -1,365 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_OS_H__
#define __SWR_OS_H__
#include <cstddef>
#include "core/knobs.h"
#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
#define SWR_API __cdecl
#define SWR_VISIBLE __declspec(dllexport)
#ifndef NOMINMAX
#undef UNICODE
#define NOMINMAX
#include <windows.h>
#undef NOMINMAX
#define UNICODE
#else
#undef UNICODE
#include <windows.h>
#define UNICODE
#endif
#include <intrin.h>
#include <cstdint>
#if defined(MemoryFence)
// Windows.h defines MemoryFence as _mm_mfence, but this conflicts with llvm::sys::MemoryFence
#undef MemoryFence
#endif
#if defined(_MSC_VER)
#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
#elif defined(__GNUC__)
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
#endif
#if defined(_DEBUG)
// We compile Debug builds with inline function expansion enabled. This allows
// functions compiled with __forceinline to be inlined even in Debug builds.
// The inline_depth(0) pragma below will disable inline function expansion for
// normal INLINE / inline functions, but not for __forceinline functions.
// Our SIMD function wrappers (see simdlib.hpp) use __forceinline even in
// Debug builds.
#define INLINE inline
#pragma inline_depth(0)
#else
// Use of __forceinline increases compile time dramatically in release builds
// and provides almost 0 measurable benefit. Disable until we have a compelling
// use-case
// #define INLINE __forceinline
#define INLINE inline
#endif
#ifndef FORCEINLINE
#define FORCEINLINE __forceinline
#endif
#define DEBUGBREAK __debugbreak()
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
__pragma(warning(push)); \
__pragma(warning(disable : __VA_ARGS__));
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
static inline void* AlignedMalloc(size_t _Size, size_t _Alignment)
{
return _aligned_malloc(_Size, _Alignment);
}
static inline void AlignedFree(void* p)
{
return _aligned_free(p);
}
#if defined(_WIN64)
#define BitScanReverseSizeT BitScanReverse64
#define BitScanForwardSizeT BitScanForward64
#define _mm_popcount_sizeT _mm_popcnt_u64
#else
#define BitScanReverseSizeT BitScanReverse
#define BitScanForwardSizeT BitScanForward
#define _mm_popcount_sizeT _mm_popcnt_u32
#endif
#if !defined(_WIN64)
extern "C" {
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
#ifdef __GNUC__
*Index = __builtin_ctzll(Mask);
#else
*Index = 0;
for (int i = 0; i < 64; ++i)
{
    if ((1ULL << i) & Mask)
    {
        *Index = i;
        break; // forward scan: report the lowest set bit
    }
}
#endif
return 1;
}
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
#ifdef __GNUC__
*Index = 63 - __builtin_clzll(Mask);
#else
*Index = 0;
for (int i = 63; i >= 0; --i)
{
    if ((1ULL << i) & Mask)
    {
        *Index = i;
        break; // reverse scan: report the highest set bit
    }
}
#endif
return 1;
}
}
#endif
#elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
#define SWR_API
#define SWR_VISIBLE __attribute__((visibility("default")))
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>
#include <stdio.h>
#include <limits.h>
typedef void VOID;
typedef void* LPVOID;
typedef int INT;
typedef unsigned int UINT;
typedef void* HANDLE;
typedef int LONG;
typedef unsigned int DWORD;
#undef FALSE
#define FALSE 0
#undef TRUE
#define TRUE 1
#define MAX_PATH PATH_MAX
#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
#ifndef INLINE
#define INLINE __inline
#endif
#ifndef FORCEINLINE
#define FORCEINLINE INLINE
#endif
#define DEBUGBREAK asm("int $3")
#if !defined(__CYGWIN__)
#ifndef __cdecl
#define __cdecl
#endif
#ifndef __stdcall
#define __stdcall
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define __declspec(x) __declspec_##x
#define __declspec_align(y) __attribute__((aligned(y)))
#define __declspec_deprecated __attribute__((deprecated))
#define __declspec_dllexport
#define __declspec_dllimport
#define __declspec_noinline __attribute__((__noinline__))
#define __declspec_nothrow __attribute__((nothrow))
#define __declspec_novtable
#define __declspec_thread __thread
#else
#define __declspec(X)
#endif
#endif
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
inline uint64_t __rdtsc()
{
long low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (low | ((uint64_t)high << 32));
}
#endif
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
// Intrinsic not defined in gcc < 10
#if (__GNUC__) && (GCC_VERSION < 100000)
static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
{
_mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
_mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
}
#endif
// gcc prior to 4.9 doesn't have _mm*_undefined_*
#if (__GNUC__) && (GCC_VERSION < 40900)
#define _mm_undefined_si128 _mm_setzero_si128
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
*Index = __builtin_ctzll(Mask);
return 1;
}
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
{
if (Mask == 0)
return 0;
*Index = __builtin_ctz(Mask);
return 1;
}
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
if (Mask == 0)
return 0;
*Index = 63 - __builtin_clzll(Mask);
return 1;
}
inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
{
if (Mask == 0)
return 0;
*Index = 31 - __builtin_clz(Mask);
return 1;
}
inline void* AlignedMalloc(size_t size, size_t alignment)
{
void* ret;
if (posix_memalign(&ret, alignment, size))
{
return NULL;
}
return ret;
}
static inline void AlignedFree(void* p)
{
free(p);
}
#define _countof(a) (sizeof(a) / sizeof(*(a)))
#define sprintf_s sprintf
#define strcpy_s(dst, size, src) strncpy(dst, src, size)
#define GetCurrentProcessId getpid
#define InterlockedCompareExchange(Dest, Exchange, Comparand) \
__sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
#define InterlockedAdd(Addend, Value) __sync_add_and_fetch(Addend, Value)
#define InterlockedAdd64(Addend, Value) __sync_add_and_fetch(Addend, Value)
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
#define PRAGMA_WARNING_PUSH_DISABLE(...)
#define PRAGMA_WARNING_POP()
#define ZeroMemory(dst, size) memset(dst, 0, size)
#else
#error Unsupported OS/system.
#endif
#define THREAD thread_local
// Universal types
typedef uint8_t KILOBYTE[1024];
typedef KILOBYTE MEGABYTE[1024];
typedef MEGABYTE GIGABYTE[1024];
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES)
#include "common/swr_assert.h"
#ifdef __GNUC__
#define ATTR_UNUSED __attribute__((unused))
#else
#define ATTR_UNUSED
#endif
#define SWR_FUNC(_retType, _funcName, /* args */...) \
typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \
_retType SWR_API _funcName(__VA_ARGS__);
// Defined in os.cpp
void SWR_API SetCurrentThreadName(const char* pThreadName);
void SWR_API CreateDirectoryPath(const std::string& path);
/// Execute Command (block until finished)
/// @returns process exit value
int SWR_API
ExecCmd(const std::string& cmd, ///< (In) Command line string
const char* pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
std::string* pOptStdOut = nullptr, ///< (Optional Out) Standard Output text
std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
/// Helper for setting up FP state
/// @returns old csr state
static INLINE uint32_t SetOptimalVectorCSR()
{
uint32_t oldCSR = _mm_getcsr();
uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK));
newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
_mm_setcsr(newCSR);
return oldCSR;
}
/// Set Vector CSR state.
/// @param csrState - should be value returned from SetOptimalVectorCSR()
static INLINE void RestoreVectorCSR(uint32_t csrState)
{
_mm_setcsr(csrState);
}
#endif //__SWR_OS_H__
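A sketch of the intended save/modify/restore pattern for the MXCSR helpers above (the worker-function name is illustrative):
static void RasterWorkerBody()
{
    uint32_t savedCSR = SetOptimalVectorCSR(); // round-to-nearest, FTZ and DAZ enabled
    // ... SIMD-heavy work that benefits from flush-to-zero ...
    RestoreVectorCSR(savedCSR); // restore the caller's FP environment
}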

View file

@ -1,192 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets.cpp
*
* @brief implementation of rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#include "rdtsc_buckets.h"
#include <inttypes.h>
#if defined(_WIN32)
#define PATH_SEPARATOR "\\"
#elif defined(__unix__) || defined(__APPLE__)
#define PATH_SEPARATOR "/"
#else
#error "Unsupported platform"
#endif
THREAD UINT tlsThreadId = 0;
BucketManager::~BucketManager()
{
}
void BucketManager::RegisterThread(const std::string& name)
{
BUCKET_THREAD newThread;
newThread.name = name;
newThread.root.children.reserve(mBuckets.size());
newThread.root.id = 0;
newThread.root.pParent = nullptr;
newThread.pCurrent = &newThread.root;
mThreadMutex.lock();
// assign unique thread id for this thread
size_t id = mThreads.size();
newThread.id = (UINT)id;
tlsThreadId = (UINT)id;
// store new thread
mThreads.push_back(newThread);
mThreadMutex.unlock();
}
UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
{
mThreadMutex.lock();
size_t id = mBuckets.size();
mBuckets.push_back(desc);
mThreadMutex.unlock();
return (UINT)id;
}
void BucketManager::PrintBucket(
FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
{
const char* arrows[] = {
"",
"|-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
" |-> ",
};
// compute percent of total cycles used by this bucket
float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0);
// compute percent of parent cycles used by this bucket
float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
// compute average cycle count per invocation
uint64_t CPE = bucket.elapsed / bucket.count;
BUCKET_DESC& desc = mBuckets[bucket.id];
// construct hierarchy visualization
std::string str = arrows[level];
str += desc.name;
char hier[80];
strcpy_s(hier, sizeof(hier)-1, str.c_str());
// print out
fprintf(f,
"%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
percentTotal,
percentParent,
bucket.elapsed,
CPE,
bucket.count,
(unsigned long)0,
(uint32_t)0,
hier);
// dump all children of this bucket
for (const BUCKET& child : bucket.children)
{
if (child.count)
{
PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child);
}
}
}
void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
{
// print header
fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str());
fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n");
// compute thread level total cycle counts across all buckets from root
const BUCKET& root = thread.root;
uint64_t totalCycles = 0;
for (const BUCKET& child : root.children)
{
totalCycles += child.elapsed;
}
for (const BUCKET& child : root.children)
{
if (child.count)
{
PrintBucket(f, 0, totalCycles, totalCycles, child);
}
}
}
void BucketManager::PrintReport(const std::string& filename)
{
{
FILE* f = fopen(filename.c_str(), "w");
assert(f);
mThreadMutex.lock();
for (const BUCKET_THREAD& thread : mThreads)
{
PrintThread(f, thread);
fprintf(f, "\n");
}
mThreadMutex.unlock();
fclose(f);
}
}
void BucketManager::StartCapture()
{
printf("Capture Starting\n");
mCapturing = true;
}
void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
{
pBucketMgr->StartBucket(id);
}
void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
{
pBucketMgr->StopBucket(id);
}

View file

@ -1,227 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rdtsc_buckets.h
*
* @brief declaration for rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#pragma once
#include "os.h"
#include <vector>
#include <mutex>
#include <sstream>
#include "rdtsc_buckets_shared.h"
// unique thread id stored in thread local storage
extern THREAD UINT tlsThreadId;
//////////////////////////////////////////////////////////////////////////
/// @brief BucketManager encapsulates a single instance of the buckets
/// functionality. There can be one or many bucket managers active
/// at any time. The manager owns all the threads and
/// bucket information that have been registered to it.
class BucketManager
{
public:
uint32_t mCurrentFrame;
std::vector<uint32_t> mBucketMap;
bool mBucketsInitialized;
std::string mBucketMgrName;
BucketManager(std::string name) : mCurrentFrame(0), mBucketsInitialized(false), mBucketMgrName(name)
{
mBucketMap.clear();
}
~BucketManager();
// removes all registered thread data
void ClearThreads()
{
mThreadMutex.lock();
mThreads.clear();
mThreadMutex.unlock();
}
// removes all registered buckets
void ClearBuckets()
{
mThreadMutex.lock();
mBuckets.clear();
mThreadMutex.unlock();
}
/// Registers a new thread with the manager.
/// @param name - name of thread, used for labels in reports and threadviz
void RegisterThread(const std::string& name);
/// Registers a new bucket type with the manager. Returns a unique
/// id which should be used in subsequent calls to start/stop the bucket
/// @param desc - description of the bucket
/// @return unique id
UINT RegisterBucket(const BUCKET_DESC& desc);
// print report
void PrintReport(const std::string& filename);
// start capturing
void StartCapture();
// stop capturing
INLINE void StopCapture()
{
mCapturing = false;
// wait for all threads to pop back to root bucket
bool stillCapturing = true;
while (stillCapturing)
{
stillCapturing = false;
for (const BUCKET_THREAD& t : mThreads)
{
if (t.level > 0)
{
stillCapturing = true;
continue;
}
}
}
mDoneCapturing = true;
printf("Capture Stopped\n");
}
// start a bucket
// @param id generated by RegisterBucket
INLINE void StartBucket(UINT id)
{
if (!mCapturing)
return;
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
uint64_t tsc = __rdtsc();
{
if (bt.pCurrent->children.size() < mBuckets.size())
{
bt.pCurrent->children.resize(mBuckets.size());
}
BUCKET& child = bt.pCurrent->children[id];
child.pParent = bt.pCurrent;
child.id = id;
child.start = tsc;
// update thread's currently executing bucket
bt.pCurrent = &child;
}
bt.level++;
}
// stop the currently executing bucket
INLINE void StopBucket(UINT id)
{
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
if (bt.level == 0)
{
return;
}
uint64_t tsc = __rdtsc();
{
if (bt.pCurrent->start == 0)
return;
SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
bt.pCurrent->elapsed += (tsc - bt.pCurrent->start);
bt.pCurrent->count++;
// pop to parent
bt.pCurrent = bt.pCurrent->pParent;
}
bt.level--;
}
INLINE void AddEvent(uint32_t id, uint32_t count)
{
if (!mCapturing)
return;
SWR_ASSERT(tlsThreadId < mThreads.size());
BUCKET_THREAD& bt = mThreads[tlsThreadId];
// don't record events for threadviz
{
if (bt.pCurrent->children.size() < mBuckets.size())
{
bt.pCurrent->children.resize(mBuckets.size());
}
BUCKET& child = bt.pCurrent->children[id];
child.pParent = bt.pCurrent;
child.id = id;
child.count += count;
}
}
private:
void PrintBucket(
FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
void PrintThread(FILE* f, const BUCKET_THREAD& thread);
// list of active threads that have registered with this manager
std::vector<BUCKET_THREAD> mThreads;
// list of buckets registered with this manager
std::vector<BUCKET_DESC> mBuckets;
// is capturing currently enabled
volatile bool mCapturing{false};
// has capturing completed
volatile bool mDoneCapturing{false};
std::mutex mThreadMutex;
std::string mThreadVizDir;
};
// C helpers for jitter
void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
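A minimal usage sketch for the BucketManager API above (bucket, thread, and file names are illustrative; with a single registered thread, StopCapture's wait loop exits immediately):
BucketManager mgr("example");
mgr.RegisterThread("main");
UINT frame = mgr.RegisterBucket({"Frame", "whole frame", false, 0xffffffffu});
mgr.StartCapture();
mgr.StartBucket(frame);
// ... timed work ...
mgr.StopBucket(frame);
mgr.StopCapture();
mgr.PrintReport("buckets.txt");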

View file

@ -1,169 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
 * @file rdtsc_buckets_shared.h
 *
 * @brief shared structures for rdtsc buckets.
*
* Notes:
*
******************************************************************************/
#pragma once
#include <vector>
#include <cassert>
struct BUCKET
{
uint32_t id{0};
uint64_t start{0};
uint64_t elapsed{0};
uint32_t count{0};
BUCKET* pParent{nullptr};
std::vector<BUCKET> children;
};
struct BUCKET_DESC
{
// name of bucket, used in reports
std::string name;
// description of bucket, used in threadviz
std::string description;
// enable for threadviz dumping
bool enableThreadViz;
// threadviz color of bucket, in RGBA8_UNORM format
uint32_t color;
};
struct BUCKET_THREAD
{
// name of thread, used in reports
std::string name;
// id for this thread, assigned by the thread manager
uint32_t id{0};
// root of the bucket hierarchy for this thread
BUCKET root;
// currently executing bucket somewhere in the hierarchy
BUCKET* pCurrent{nullptr};
// currently executing hierarchy level
uint32_t level{0};
// threadviz file object
FILE* vizFile{nullptr};
BUCKET_THREAD() {}
BUCKET_THREAD(const BUCKET_THREAD& that)
{
name = that.name;
id = that.id;
root = that.root;
pCurrent = &root;
vizFile = that.vizFile;
}
};
enum VIZ_TYPE
{
VIZ_START = 0,
VIZ_STOP = 1,
VIZ_DATA = 2
};
struct VIZ_START_DATA
{
uint8_t type;
uint32_t bucketId;
uint64_t timestamp;
};
struct VIZ_STOP_DATA
{
uint8_t type;
uint64_t timestamp;
};
inline void Serialize(FILE* f, const VIZ_START_DATA& data)
{
fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
}
inline void Deserialize(FILE* f, VIZ_START_DATA& data)
{
fread(&data, sizeof(VIZ_START_DATA), 1, f);
assert(data.type == VIZ_START);
}
inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
{
fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
}
inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
{
fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
assert(data.type == VIZ_STOP);
}
inline void Serialize(FILE* f, const std::string& string)
{
assert(string.size() < 256); // length is stored as uint8_t, so 256 would wrap to 0
uint8_t length = (uint8_t)string.size();
fwrite(&length, sizeof(length), 1, f);
fwrite(string.c_str(), string.size(), 1, f);
}
inline void Deserialize(FILE* f, std::string& string)
{
char cstr[256];
uint8_t length;
fread(&length, sizeof(length), 1, f);
fread(cstr, length, 1, f);
cstr[length] = 0;
string.assign(cstr);
}
inline void Serialize(FILE* f, const BUCKET_DESC& desc)
{
Serialize(f, desc.name);
Serialize(f, desc.description);
fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
fwrite(&desc.color, sizeof(desc.color), 1, f);
}
inline void Deserialize(FILE* f, BUCKET_DESC& desc)
{
Deserialize(f, desc.name);
Deserialize(f, desc.description);
fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
fread(&desc.color, sizeof(desc.color), 1, f);
}
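A round-trip sketch for the Serialize/Deserialize pair above (the file name is illustrative; error handling is omitted, as in the originals):
BUCKET_DESC in{"Raster", "rasterizer bucket", false, 0xff00ff00u};
FILE* f = fopen("desc.bin", "wb");
Serialize(f, in);
fclose(f);
BUCKET_DESC out;
f = fopen("desc.bin", "rb");
Deserialize(f, out);
fclose(f);
assert(out.name == in.name && out.color == in.color);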

View file

@ -1,168 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__
#if KNOB_SIMD16_WIDTH == 16
typedef SIMD512 SIMD16;
#else
#error Unsupported vector width
#endif // KNOB_SIMD16_WIDTH == 16
#define _simd16_setzero_ps SIMD16::setzero_ps
#define _simd16_setzero_si SIMD16::setzero_si
#define _simd16_set1_ps SIMD16::set1_ps
#define _simd16_set1_epi8 SIMD16::set1_epi8
#define _simd16_set1_epi32 SIMD16::set1_epi32
#define _simd16_set_ps SIMD16::set_ps
#define _simd16_set_epi32 SIMD16::set_epi32
#define _simd16_load_ps SIMD16::load_ps
#define _simd16_loadu_ps SIMD16::loadu_ps
#if 1
#define _simd16_load1_ps SIMD16::broadcast_ss
#endif
#define _simd16_load_si SIMD16::load_si
#define _simd16_loadu_si SIMD16::loadu_si
#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m)
#define _simd16_store_ps SIMD16::store_ps
#define _simd16_store_si SIMD16::store_si
#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a)
#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a)
#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b)
#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b)
#define _simd16_maskstore_ps SIMD16::maskstore_ps
#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b)
#define _simd16_blendv_ps SIMD16::blendv_ps
#define _simd16_blendv_epi32 SIMD16::blendv_epi32
#define _simd16_mul_ps SIMD16::mul_ps
#define _simd16_div_ps SIMD16::div_ps
#define _simd16_add_ps SIMD16::add_ps
#define _simd16_sub_ps SIMD16::sub_ps
#define _simd16_rsqrt_ps SIMD16::rsqrt_ps
#define _simd16_min_ps SIMD16::min_ps
#define _simd16_max_ps SIMD16::max_ps
#define _simd16_movemask_ps SIMD16::movemask_ps
#define _simd16_movemask_pd SIMD16::movemask_pd
#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32
#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32
#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps
#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
#define _simd16_cmplt_ps SIMD16::cmplt_ps
#define _simd16_cmpgt_ps SIMD16::cmpgt_ps
#define _simd16_cmpneq_ps SIMD16::cmpneq_ps
#define _simd16_cmpeq_ps SIMD16::cmpeq_ps
#define _simd16_cmpge_ps SIMD16::cmpge_ps
#define _simd16_cmple_ps SIMD16::cmple_ps
#define _simd16_castsi_ps SIMD16::castsi_ps
#define _simd16_castps_si SIMD16::castps_si
#define _simd16_castsi_pd SIMD16::castsi_pd
#define _simd16_castpd_si SIMD16::castpd_si
#define _simd16_castpd_ps SIMD16::castpd_ps
#define _simd16_castps_pd SIMD16::castps_pd
#define _simd16_and_ps SIMD16::and_ps
#define _simd16_andnot_ps SIMD16::andnot_ps
#define _simd16_or_ps SIMD16::or_ps
#define _simd16_xor_ps SIMD16::xor_ps
#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
#define _simd16_mul_epi32 SIMD16::mul_epi32
#define _simd16_mullo_epi32 SIMD16::mullo_epi32
#define _simd16_sub_epi32 SIMD16::sub_epi32
#define _simd16_sub_epi64 SIMD16::sub_epi64
#define _simd16_min_epi32 SIMD16::min_epi32
#define _simd16_max_epi32 SIMD16::max_epi32
#define _simd16_min_epu32 SIMD16::min_epu32
#define _simd16_max_epu32 SIMD16::max_epu32
#define _simd16_add_epi32 SIMD16::add_epi32
#define _simd16_and_si SIMD16::and_si
#define _simd16_andnot_si SIMD16::andnot_si
#define _simd16_or_si SIMD16::or_si
#define _simd16_xor_si SIMD16::xor_si
#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32
#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32
#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32
#define _simd16_testz_ps SIMD16::testz_ps
#define _simd16_unpacklo_ps SIMD16::unpacklo_ps
#define _simd16_unpackhi_ps SIMD16::unpackhi_ps
#define _simd16_unpacklo_pd SIMD16::unpacklo_pd
#define _simd16_unpackhi_pd SIMD16::unpackhi_pd
#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8
#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8
#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16
#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16
#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32
#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32
#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64
#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64
#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a)
#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a)
#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a)
#define _simd16_fmadd_ps SIMD16::fmadd_ps
#define _simd16_fmsub_ps SIMD16::fmsub_ps
#define _simd16_adds_epu8 SIMD16::adds_epu8
#define _simd16_subs_epu8 SIMD16::subs_epu8
#define _simd16_add_epi8 SIMD16::add_epi8
#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8
#define _simd16_i32gather_ps(m, index, scale) \
SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) \
SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
#define _simd16_abs_epi32 SIMD16::abs_epi32
#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64
#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64
#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16
#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16
#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8
#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8
#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a)
#define _simd16_permute_ps SIMD16::permute_ps
#define _simd16_permute_epi32 SIMD16::permute_epi32
#define _simd16_sllv_epi32 SIMD16::sllv_epi32
#define _simd16_srlv_epi32 SIMD16::srlv_epi32
#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b)
#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b)
#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b)
#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b)
#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b)
#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b)
#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b)
#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16
#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32
#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32
#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64
#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64
#define _simd16_packus_epi16 SIMD16::packus_epi16
#define _simd16_packs_epi16 SIMD16::packs_epi16
#define _simd16_packus_epi32 SIMD16::packus_epi32
#define _simd16_packs_epi32 SIMD16::packs_epi32
#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
#define _simd16_int2mask(mask) simd16mask(mask)
#define _simd16_mask2int(mask) int(mask)
#define _simd16_vmask_ps SIMD16::vmask_ps
#endif //__SWR_SIMD16INTRIN_H__
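A brief usage sketch for the wrappers above, written against the SIMD16 type directly (values are illustrative):
SIMD16::Float a = _simd16_set1_ps(1.0f);
SIMD16::Float b = _simd16_set1_ps(2.0f);
SIMD16::Float r = _simd16_fmadd_ps(a, b, _simd16_set1_ps(0.5f)); // each of the 16 lanes = 1*2 + 0.5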

View file

@ -1,322 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__
#include "common/intrin.h"
#include "common/simdlib.hpp"
#if KNOB_SIMD_WIDTH == 8
typedef SIMD256 SIMD;
#else
#error Unsupported vector width
#endif // KNOB_SIMD_WIDTH == 8
#define _simd128_maskstore_ps SIMD128::maskstore_ps
#define _simd128_fmadd_ps SIMD128::fmadd_ps
#define _simd_load_ps SIMD::load_ps
#define _simd_load1_ps SIMD::broadcast_ss
#define _simd_loadu_ps SIMD::loadu_ps
#define _simd_setzero_ps SIMD::setzero_ps
#define _simd_set1_ps SIMD::set1_ps
#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
#define _simd_blendv_ps SIMD::blendv_ps
#define _simd_store_ps SIMD::store_ps
#define _simd_mul_ps SIMD::mul_ps
#define _simd_add_ps SIMD::add_ps
#define _simd_sub_ps SIMD::sub_ps
#define _simd_rsqrt_ps SIMD::rsqrt_ps
#define _simd_min_ps SIMD::min_ps
#define _simd_max_ps SIMD::max_ps
#define _simd_movemask_ps SIMD::movemask_ps
#define _simd_cvtps_epi32 SIMD::cvtps_epi32
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
#define _simd_cmplt_ps SIMD::cmplt_ps
#define _simd_cmpgt_ps SIMD::cmpgt_ps
#define _simd_cmpneq_ps SIMD::cmpneq_ps
#define _simd_cmpeq_ps SIMD::cmpeq_ps
#define _simd_cmpge_ps SIMD::cmpge_ps
#define _simd_cmple_ps SIMD::cmple_ps
#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
#define _simd_and_ps SIMD::and_ps
#define _simd_or_ps SIMD::or_ps
#define _simd_rcp_ps SIMD::rcp_ps
#define _simd_div_ps SIMD::div_ps
#define _simd_castsi_ps SIMD::castsi_ps
#define _simd_castps_pd SIMD::castps_pd
#define _simd_castpd_ps SIMD::castpd_ps
#define _simd_andnot_ps SIMD::andnot_ps
#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
#define _simd_castpd_ps SIMD::castpd_ps
#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
#define _simd_stream_ps SIMD::stream_ps
#define _simd_movemask_pd SIMD::movemask_pd
#define _simd_castsi_pd SIMD::castsi_pd
#define _simd_mul_epi32 SIMD::mul_epi32
#define _simd_mullo_epi32 SIMD::mullo_epi32
#define _simd_sub_epi32 SIMD::sub_epi32
#define _simd_sub_epi64 SIMD::sub_epi64
#define _simd_min_epi32 SIMD::min_epi32
#define _simd_min_epu32 SIMD::min_epu32
#define _simd_max_epi32 SIMD::max_epi32
#define _simd_max_epu32 SIMD::max_epu32
#define _simd_add_epi32 SIMD::add_epi32
#define _simd_and_si SIMD::and_si
#define _simd_andnot_si SIMD::andnot_si
#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
#define _simd_cmplt_epi32 SIMD::cmplt_epi32
#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
#define _simd_or_si SIMD::or_si
#define _simd_xor_si SIMD::xor_si
#define _simd_castps_si SIMD::castps_si
#define _simd_adds_epu8 SIMD::adds_epu8
#define _simd_subs_epu8 SIMD::subs_epu8
#define _simd_add_epi8 SIMD::add_epi8
#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
#define _simd_movemask_epi8 SIMD::movemask_epi8
#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
#define _simd_permute_ps SIMD::permute_ps
#define _simd_permute_epi32 SIMD::permute_epi32
#define _simd_srlv_epi32 SIMD::srlv_epi32
#define _simd_sllv_epi32 SIMD::sllv_epi32
#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)
#define _simd_fmadd_ps SIMD::fmadd_ps
#define _simd_fmsub_ps SIMD::fmsub_ps
#define _simd_shuffle_epi8 SIMD::shuffle_epi8
#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
#define _simd_mask_i32gather_ps(r, p, o, m, s) \
SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
#define _simd_abs_epi32 SIMD::abs_epi32
#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
#define _simd_packus_epi16 SIMD::packus_epi16
#define _simd_packs_epi16 SIMD::packs_epi16
#define _simd_packus_epi32 SIMD::packus_epi32
#define _simd_packs_epi32 SIMD::packs_epi32
#define _simd_unpacklo_ps SIMD::unpacklo_ps
#define _simd_unpackhi_ps SIMD::unpackhi_ps
#define _simd_unpacklo_pd SIMD::unpacklo_pd
#define _simd_unpackhi_pd SIMD::unpackhi_pd
#define _simd_insertf128_ps SIMD::insertf128_ps
#define _simd_insertf128_pd SIMD::insertf128_pd
#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
#define _simd_set1_epi32 SIMD::set1_epi32
#define _simd_set_epi32 SIMD::set_epi32
#define _simd_set_ps SIMD::set_ps
#define _simd_set1_epi8 SIMD::set1_epi8
#define _simd_setzero_si SIMD::setzero_si
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
#define _simd_store_si SIMD::store_si
#define _simd_broadcast_ss SIMD::broadcast_ss
#define _simd_maskstore_ps SIMD::maskstore_ps
#define _simd_load_si SIMD::load_si
#define _simd_loadu_si SIMD::loadu_si
#define _simd_sub_ps SIMD::sub_ps
#define _simd_testz_ps SIMD::testz_ps
#define _simd_testz_si SIMD::testz_si
#define _simd_xor_ps SIMD::xor_ps
#define _simd_loadu2_si SIMD::loadu2_si
#define _simd_storeu2_si SIMD::storeu2_si
#define _simd_blendv_epi32 SIMD::blendv_epi32
#define _simd_vmask_ps SIMD::vmask_ps
template <int mask>
SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b)
{
return SIMD128::castps_si(
SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
simdscalar const& vB,
simdscalar const& vC,
simdscalar const& vX,
simdscalar const& vY)
{
simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
vOut = _simd_fmadd_ps(vB, vY, vOut);
return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA,
simd4scalar const& vB,
simd4scalar const& vC,
simd4scalar const& vX,
simd4scalar const& vY)
{
simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
vOut = _simd128_fmadd_ps(vB, vY, vOut);
return vOut;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI,
simdscalar const& vJ,
const float* pInterpBuffer)
{
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0]))
{
// Ensure constant attribs are constant. Required for proper
// 3D resource copies.
return _simd_broadcast_ss(pInterpA);
}
simdscalar vA = _simd_broadcast_ss(pInterpA);
simdscalar vB = _simd_broadcast_ss(pInterpB);
simdscalar vC = _simd_broadcast_ss(pInterpC);
simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
vC = _simd_mul_ps(vk, vC);
return vplaneps(vA, vB, vC, vI, vJ);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component (flat shade).
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer)
{
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
simdscalar vA = _simd_broadcast_ss(pInterpA);
return vA;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component (flat shade).
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer)
{
const uint32_t interpA = pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
simdscalari vA = _simd_set1_epi32(interpA);
return vA;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template <UINT Attrib, UINT Comp, UINT numComponents = 4>
static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI,
simd4scalar const& vJ,
const float* pInterpBuffer)
{
const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
if ((pInterpA[0] == pInterpB[0]) && (pInterpA[0] == pInterpC[0]))
{
// Ensure constant attribs are constant. Required for proper
// 3D resource copies.
return SIMD128::broadcast_ss(pInterpA);
}
simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
vC = SIMD128::mul_ps(vk, vC);
return vplaneps(vA, vB, vC, vI, vJ);
}
static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a)
{
simd4scalari ai = SIMD128::castps_si(a);
return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
}
static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
{
simdscalari ai = _simd_castps_si(a);
return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
#include "simd16intrin.h"
#endif //__SWR_SIMDINTRIN_H__
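A worked sketch of the plane evaluation above: vplaneps computes vA * vX + vB * vY + vC per lane with two FMAs, so constant inputs make the result easy to check by hand:
simdscalar vA = _simd_set1_ps(2.0f);
simdscalar vB = _simd_set1_ps(3.0f);
simdscalar vC = _simd_set1_ps(1.0f);
simdscalar vX = _simd_set1_ps(0.25f);
simdscalar vY = _simd_set1_ps(0.5f);
simdscalar vR = vplaneps(vA, vB, vC, vX, vY); // every lane: 2*0.25 + 3*0.5 + 1 = 3.0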

View file

@ -1,234 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "simdlib_types.hpp"
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
namespace SIMDImpl
{
namespace SIMD128Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#if defined(SIMD_OPT_128_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_128_avx512_knights.inl"
#else // optimize for core
#include "simdlib_128_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_128_AVX512
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD128Impl::Float;
using Double = SIMD128Impl::Double;
using Integer = SIMD128Impl::Integer;
using Vec4 = SIMD128Impl::Vec4;
using Mask = SIMD128Impl::Mask;
};
} // namespace SIMD128Impl
namespace SIMD256Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#if defined(SIMD_OPT_256_AVX512)
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_256_avx512_knights.inl"
#else // optimize for core
#include "simdlib_256_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
#endif // SIMD_OPT_256_AVX512
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD256Impl::Float;
using Double = SIMD256Impl::Double;
using Integer = SIMD256Impl::Integer;
using Vec4 = SIMD256Impl::Vec4;
using Mask = SIMD256Impl::Mask;
};
} // namespace SIMD256Impl
namespace SIMD512Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
template <typename SIMD256T>
struct AVXImplBase
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImplBase
using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_512_avx512_knights.inl"
#include "simdlib_512_avx512_masks_knights.inl"
#else // optimize for core
#include "simdlib_512_avx512_core.inl"
#include "simdlib_512_avx512_masks_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD512Impl::Float;
using Double = SIMD512Impl::Double;
using Integer = SIMD512Impl::Integer;
using Vec4 = SIMD512Impl::Vec4;
using Mask = SIMD512Impl::Mask;
};
} // namespace SIMD512Impl
} // namespace SIMDImpl
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
using CompareType = typename Traits::CompareType;
using ScaleFactor = typename Traits::ScaleFactor;
using RoundMode = typename Traits::RoundMode;
using SIMD = typename Traits::IsaImpl;
using Float = typename Traits::Float;
using Double = typename Traits::Double;
using Integer = typename Traits::Integer;
using Vec4 = typename Traits::Vec4;
using Mask = typename Traits::Mask;
}; // struct SIMDBase
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
template <typename SIMD_T>
using CompareType = typename SIMD_T::CompareType;
template <typename SIMD_T>
using ScaleFactor = typename SIMD_T::ScaleFactor;
template <typename SIMD_T>
using RoundMode = typename SIMD_T::RoundMode;
template <typename SIMD_T>
using Float = typename SIMD_T::Float;
template <typename SIMD_T>
using Double = typename SIMD_T::Double;
template <typename SIMD_T>
using Integer = typename SIMD_T::Integer;
template <typename SIMD_T>
using Vec4 = typename SIMD_T::Vec4;
template <typename SIMD_T>
using Mask = typename SIMD_T::Mask;
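// A minimal usage sketch (illustrative only, built from the wrappers defined
// in the per-ISA .inl files): client code picks a width through the
// SIMD128/SIMD256/SIMD512 facades and calls the static methods directly:
//
//     SIMD256::Float a = SIMD256::set1_ps(1.0f);
//     SIMD256::Float b = SIMD256::set1_ps(2.0f);
//     SIMD256::Float r = SIMD256::fmadd_ps(a, b, b); // (1.0f * 2.0f) + 2.0f per lane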

View file

@ -1,593 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (1) implementation
//============================================================================
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm_##op(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm_##op(a, b, ImmT); \
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm_##op(a, b, ImmT); \
}
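// For illustration, a wrapper such as SIMD_WRAPPER_2(add_ps) expands to a
// thin forwarding function around the corresponding intrinsic:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return _mm_add_ps(a, b);
//     }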
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
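// For example, per 32-bit lane: 0x00010001 * 0x00010001 = 0x0000000100020001
// as a 64-bit intermediate, so mullo_epi32 stores 0x00020001.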
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
int32_t a, count;
a = _mm_extract_epi32(vA, 0);
count = _mm_extract_epi32(vB, 0);
a <<= count;
vA = _mm_insert_epi32(vA, a, 0);
a = _mm_extract_epi32(vA, 1);
count = _mm_extract_epi32(vB, 1);
a <<= count;
vA = _mm_insert_epi32(vA, a, 1);
a = _mm_extract_epi32(vA, 2);
count = _mm_extract_epi32(vB, 2);
a <<= count;
vA = _mm_insert_epi32(vA, a, 2);
a = _mm_extract_epi32(vA, 3);
count = _mm_extract_epi32(vB, 3);
a <<= count;
vA = _mm_insert_epi32(vA, a, 3);
return vA;
}
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
static SIMDINLINE Integer SIMDCALL srl_epi64(Integer a, Integer n)
{
return _mm_srl_epi64(a, n);
}
template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
int32_t a, count;
a = _mm_extract_epi32(vA, 0);
count = _mm_extract_epi32(vB, 0);
a >>= count;
vA = _mm_insert_epi32(vA, a, 0);
a = _mm_extract_epi32(vA, 1);
count = _mm_extract_epi32(vB, 1);
a >>= count;
vA = _mm_insert_epi32(vA, a, 1);
a = _mm_extract_epi32(vA, 2);
count = _mm_extract_epi32(vB, 2);
a >>= count;
vA = _mm_insert_epi32(vA, a, 2);
a = _mm_extract_epi32(vA, 3);
count = _mm_extract_epi32(vB, 3);
a >>= count;
vA = _mm_insert_epi32(vA, a, 3);
return vA;
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm_castps_si128(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm_castsi128_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm_castps_pd(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm_castsi128_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm_cvtepi32_ps(a);
}
static SIMDINLINE int32_t SIMDCALL cvtsi128_si32(Integer a) // return a.v[0]
{
return _mm_cvtsi128_si32(a);
}
static SIMDINLINE Integer SIMDCALL cvtsi32_si128(int32_t n) // return a[0] = n, a[1]...a[3] = 0
{
return _mm_cvtsi32_si128(n);
}
SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a,
Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a,
Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm_testz_si128(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
return _mm_broadcast_ss(p);
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL
permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm_permutevar_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
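// A minimal usage sketch (pArray is a hypothetical float array with at least
// four elements); with the default ScaleFactor::SF_1 the indices are raw byte
// offsets:
//
//     Integer idx = set_epi32(12, 8, 4, 0);                    // byte offsets
//     Float v = i32gather_ps<ScaleFactor::SF_1>(pArray, idx);  // pArray[0..3]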
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return _mm_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return _mm_load_si128(&p->v);
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm_lddqu_si128(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
unsigned long index;
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{
umask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[index] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
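// The loop above walks the set bits of the movemask with _BitScanForward,
// loading only those lanes whose sign bit was set in the vector mask.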
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
_mm_maskstore_ps(p, mask, src);
}
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return static_cast<uint32_t>(_mm_movemask_epi8(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
return static_cast<uint32_t>(_mm_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
return static_cast<uint32_t>(_mm_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm_setzero_si128();
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm_store_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL
storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm_storeu_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm_stream_ps(p, a);
}
static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
return _mm_set_ps(in3, in2, in1, in0);
}
static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
{
return _mm_set_epi32(in3, in2, in1, in0);
}
template <int ImmT>
static SIMDINLINE float SIMDCALL extract_ps(Float a)
{
int tmp = _mm_extract_ps(a, ImmT);
return *reinterpret_cast<float*>(&tmp);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
Integer vec = set1_epi32(mask);
const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
vec = and_si(vec, bit);
vec = cmplt_epi32(setzero_si(), vec);
return castsi_ps(vec);
}
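// Sketch of what vmask_ps computes: the scalar mask is broadcast, ANDed with
// the per-lane bits {0x1, 0x2, 0x4, 0x8}, and the signed compare (0 < lane)
// turns each surviving bit into an all-ones lane. For example, vmask_ps(0b0101)
// yields lanes {~0, 0, ~0, 0} (element 0 first), reinterpreted as Float.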
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I

View file

@ -1,66 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are ones that replace AVX (1) operations.
// Only two shifts and two gathers were introduced with AVX2, along with
// native support for FMA operations.
//============================================================================
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
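// These forward to the native FMA intrinsics (_mm_fmadd_ps / _mm_fmsub_ps),
// replacing the mul+add emulation inherited from the AVX (1) implementation.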
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
return _mm_sllv_epi32(vA, vB);
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
return _mm_srlv_epi32(vA, vB);
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
}
#undef SIMD_WRAPPER_3

View file

@ -1,368 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
private:
static SIMDINLINE __m512 __conv(Float r)
{
return _mm512_castps128_ps512(r.v);
}
static SIMDINLINE __m512d __conv(Double r)
{
return _mm512_castpd128_pd512(r.v);
}
static SIMDINLINE __m512i __conv(Integer r)
{
return _mm512_castsi128_si512(r.v);
}
static SIMDINLINE Float __conv(__m512 r)
{
return _mm512_castps512_ps128(r);
}
static SIMDINLINE Double __conv(__m512d r)
{
return _mm512_castpd512_pd128(r);
}
static SIMDINLINE Integer __conv(__m512i r)
{
return _mm512_castsi512_si128(r);
}
public:
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
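// For illustration, SIMD_WRAPPER_2(add_ps) here expands to a masked 512-bit
// operation that touches only the low four lanes:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return __conv(_mm512_maskz_add_ps(__mmask16(0xf), __conv(a), __conv(b)));
//     }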
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8); // return a > b (int8)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
// return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}
SIMD_IWRAPPER_1I_32(shuffle_epi32);
// template<int ImmT>
// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
__mmask16 m = 0xf;
m = _mm512_mask_test_epi32_mask(
m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
return __conv(
_mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
}
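// The test-mask above converts the floating-point vector mask into a
// __mmask16 by testing each lane's sign bit (0x80000000), restricted to the
// low four lanes by the initial 0xf mask.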
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
// {
// __mmask64 m = 0xffffull;
// return static_cast<uint32_t>(
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
__mmask16 m = 0xf;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_storeu_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1)));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -1,196 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and
// _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and
// _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and
// _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
// _mm512_packus_epi32
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffull;
return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
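// (Each of the low 16 bytes is tested against 0x80, i.e. its sign bit,
// mirroring what _mm_movemask_epi8 reports for a 128-bit register.)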
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -1,34 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation for Knights Family
//
// Since this implementation inherits from the AVX512Base implementation,
// the only operations below are ones that replace AVX512F / AVX512CD operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

View file

@ -1,826 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
using SIMD128T = SIMD128Impl::AVXImpl;
//============================================================================
// SIMD256 AVX (1) implementation
//============================================================================
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return _mm256_##op(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
{ \
return _mm256_##op(a, b, ImmT); \
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
{ \
return _mm256_##op(a, b, c); \
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return castps_si(intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IFWRAPPER_2I(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT)); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
{ \
return _mm256_##op(a, b, c); \
}
// emulated integer simd
#define SIMD_EMU_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD128T::op(a.v4[0]), \
SIMD128T::op(a.v4[1]), \
}; \
}
#define SIMD_EMU_IWRAPPER_1L(op, shift) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD128T::op(a.v4[0]), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
}; \
} \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
{ \
return Integer{ \
SIMD128T::op(a), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
}; \
}
#define SIMD_EMU_IWRAPPER_1I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD128T::template op<ImmT>(a.v4[0]), \
SIMD128T::template op<ImmT>(a.v4[1]), \
}; \
}
#define SIMD_EMU_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD128T::op(a.v4[0], b.v4[0]), \
SIMD128T::op(a.v4[1], b.v4[1]), \
}; \
}
#define SIMD_EMU_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]), \
SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]), \
}; \
}
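// For illustration, SIMD_EMU_IWRAPPER_2(add_epi32) expands to an emulated
// 256-bit add built from two 128-bit halves:
//
//     static SIMDINLINE Integer SIMDCALL add_epi32(Integer const& a, Integer const& b)
//     {
//         return Integer{
//             SIMD128T::add_epi32(a.v4[0], b.v4[0]),
//             SIMD128T::add_epi32(a.v4[1], b.v4[1]),
//         };
//     }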
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
Float const& b,
Float const& c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
Float const& b,
Float const& c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
return _mm256_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
Integer const& vCount) // return a << b (uint32)
{
int32_t aHi, aLow, countHi, countLow;
__m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
__m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
__m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
__m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
aHi = _mm_extract_epi32(vAHi, 0);
countHi = _mm_extract_epi32(vCountHi, 0);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 0);
aLow = _mm_extract_epi32(vALow, 0);
countLow = _mm_extract_epi32(vCountLow, 0);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 0);
aHi = _mm_extract_epi32(vAHi, 1);
countHi = _mm_extract_epi32(vCountHi, 1);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 1);
aLow = _mm_extract_epi32(vALow, 1);
countLow = _mm_extract_epi32(vCountLow, 1);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 1);
aHi = _mm_extract_epi32(vAHi, 2);
countHi = _mm_extract_epi32(vCountHi, 2);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 2);
aLow = _mm_extract_epi32(vALow, 2);
countLow = _mm_extract_epi32(vCountLow, 2);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 2);
aHi = _mm_extract_epi32(vAHi, 3);
countHi = _mm_extract_epi32(vCountHi, 3);
aHi <<= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 3);
aLow = _mm_extract_epi32(vALow, 3);
countLow = _mm_extract_epi32(vCountLow, 3);
aLow <<= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 3);
__m256i ret = _mm256_set1_epi32(0);
ret = _mm256_insertf128_si256(ret, vAHi, 1);
ret = _mm256_insertf128_si256(ret, vALow, 0);
return ret;
}
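// AVX (1) has no per-lane variable-shift instruction (vpsllvd/vpsrlvd arrived
// with AVX2), hence the lane-by-lane extract/insert emulation above.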
SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint)
template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
Integer const& vCount) // return a >> b (uint32)
{
int32_t aHi, aLow, countHi, countLow;
__m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
__m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
__m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
__m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
aHi = _mm_extract_epi32(vAHi, 0);
countHi = _mm_extract_epi32(vCountHi, 0);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 0);
aLow = _mm_extract_epi32(vALow, 0);
countLow = _mm_extract_epi32(vCountLow, 0);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 0);
aHi = _mm_extract_epi32(vAHi, 1);
countHi = _mm_extract_epi32(vCountHi, 1);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 1);
aLow = _mm_extract_epi32(vALow, 1);
countLow = _mm_extract_epi32(vCountLow, 1);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 1);
aHi = _mm_extract_epi32(vAHi, 2);
countHi = _mm_extract_epi32(vCountHi, 2);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 2);
aLow = _mm_extract_epi32(vALow, 2);
countLow = _mm_extract_epi32(vCountLow, 2);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 2);
aHi = _mm_extract_epi32(vAHi, 3);
countHi = _mm_extract_epi32(vCountHi, 3);
aHi >>= countHi;
vAHi = _mm_insert_epi32(vAHi, aHi, 3);
aLow = _mm_extract_epi32(vALow, 3);
countLow = _mm_extract_epi32(vCountLow, 3);
aLow >>= countLow;
vALow = _mm_insert_epi32(vALow, aLow, 3);
__m256i ret = _mm256_set1_epi32(0);
ret = _mm256_insertf128_si256(ret, vAHi, 1);
ret = _mm256_insertf128_si256(ret, vALow, 0);
return ret;
}
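// As with sllv_epi32, the emulation above reduces to the per-lane sketch:
//     for (int i = 0; i < 8; ++i)
//         dst[i] = a[i] >> count[i];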
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
{
return _mm256_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
{
return _mm256_castps_si256(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
{
return _mm256_castsi256_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
{
return _mm256_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a)
{
return _mm256_castpd_si256(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
{
return _mm256_castsi256_ps(a);
}
static SIMDINLINE Float SIMDCALL
cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
{
return _mm256_cvtepi32_ps(a);
}
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16)
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL
cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
{
return _mm256_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm256_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
{
return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL
testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm256_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL
testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm256_testz_si256(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Float const& mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Integer const& mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
return _mm256_broadcast_ss(p);
}
SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return _mm256_permute_ps(a, ImmT);
}
static SIMDINLINE Integer SIMDCALL permute_epi32(
Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
Integer result;
// Ugly slow implementation
uint32_t const* pA = reinterpret_cast<uint32_t const*>(&a);
uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
uint32_t* pResult = reinterpret_cast<uint32_t*>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0x7 & pSwiz[i]]; // mask the swizzle index to the 8 valid lanes
}
return result;
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
Float result;
// Ugly slow implementation
float const* pA = reinterpret_cast<float const*>(&a);
uint32_t const* pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
float* pResult = reinterpret_cast<float*>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0x7 & pSwiz[i]]; // mask the swizzle index to the 8 valid lanes
}
return result;
}
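// Usage sketch (hypothetical values): with swiz = {7, 6, 5, 4, 3, 2, 1, 0}
// the loop above reverses the lanes, i.e. result[i] = a[7 - i].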
SIMD_WRAPPER_2I(permute2f128_ps);
SIMD_DWRAPPER_2I(permute2f128_pd);
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return i32gather_ps<ScaleT>(p, idx);
}
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return _mm256_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return _mm256_load_si256(&p->v);
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm256_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm256_lddqu_si256(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
uint32_t* pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
unsigned long index = 0;
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{
umask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[index] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
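// The _BitScanForward loop visits exactly the lanes whose sign bit is set in
// 'mask' (movemask_ps packs those sign bits into 'umask'); every other lane
// keeps its value from 'old'. A hypothetical call gathering lanes 0 and 2:
//     Float v = mask_i32gather_ps(old, pBase, idx, vmask_ps(0x5));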
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
return mask_i32gather_ps<ScaleT>(old, p, idx, mask);
}
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
_mm256_maskstore_ps(p, mask, src);
}
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16);
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
return static_cast<uint32_t>(_mm256_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm256_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm256_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm256_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm256_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm256_setzero_si256();
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
{
_mm256_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
{
_mm256_store_si256(&p->v, a);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm256_stream_ps(p, a);
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
{
return _mm256_broadcast_ps(&p->v);
}
template <int ImmT>
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
{
return _mm256_extractf128_pd(a, ImmT);
}
template <int ImmT>
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
{
return _mm256_extractf128_ps(a, ImmT);
}
template <int ImmT>
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
{
return _mm256_extractf128_si256(a, ImmT);
}
template <int ImmT>
static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
{
return _mm256_insertf128_pd(a, b, ImmT);
}
template <int ImmT>
static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
{
return _mm256_insertf128_ps(a, b, ImmT);
}
template <int ImmT>
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
{
return _mm256_insertf128_si256(a, b, ImmT);
}
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif
#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
/* SIMD128Impl::Integer const* */ loaddr) \
_mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
SIMD128Impl::Integer const* plo)
{
return _mm256_loadu2_m128i(&phi->v, &plo->v);
}
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
SIMD128Impl::Integer* plo,
Integer const& src)
{
_mm256_storeu2_m128i(&phi->v, &plo->v, src);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
Integer vec = set1_epi32(mask);
const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = and_si(vec, bit);
vec = cmplt_epi32(setzero_si(), vec);
return castsi_ps(vec);
}
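// Each bit i of 'mask' selects 32-bit lane i: the and/cmplt sequence turns a
// set bit into an all-ones lane. For example, vmask_ps(0x81) yields all-ones
// in lanes 0 and 7 and zero elsewhere.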
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_3
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I

View file

@ -1,255 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are the ones that replace AVX (1) operations.
// Mostly these are integer operations that are no longer emulated with SSE.
//============================================================================
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
#define SIMD_IWRAPPER_1L(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return _mm256_##op(_mm256_castsi256_si128(a)); \
}
#define SIMD_IWRAPPER_1I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return _mm256_##op(a, ImmT); \
}
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return _mm256_##intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##intrin(a, b); \
}
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##op(a, b); \
}
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return _mm256_##op(a, b, ImmT); \
}
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
Float const& b,
Float const& c) // return (a * b) + c
{
return _mm256_fmadd_ps(a, b, c);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
// Using and_ps instead inhibits the compiler's constant folding and actually issues
// the and intrinsic even though both inputs are constant values.
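// Leaving and_si undefined here means the AVX (1) version (implemented via
// _mm256_and_ps above) is inherited, which sidesteps the miscompiled
// constant folding.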
#else
// Use native integer and intrinsic
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
#endif
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
template <int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a,
Integer const& b) // return a < b (int32)
{
return cmpgt_epi32(b, a);
}
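// AVX2 has no native cmplt intrinsic, so a < b is expressed as b > a.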
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return _mm256_permute_ps(a, ImmT);
}
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
static SIMDINLINE Float SIMDCALL
permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm256_permutevar8x32_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
// correctly in early versions of MSVC 2019
#else
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
#endif
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
return static_cast<uint32_t>(_mm256_movemask_epi8(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1L
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I

View file

@ -1,349 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are the ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
private:
static SIMDINLINE __m512 __conv(Float r)
{
return _mm512_castps256_ps512(r.v);
}
static SIMDINLINE __m512d __conv(Double r)
{
return _mm512_castpd256_pd512(r.v);
}
static SIMDINLINE __m512i __conv(Integer r)
{
return _mm512_castsi256_si512(r.v);
}
static SIMDINLINE Float __conv(__m512 r)
{
return _mm512_castps512_ps256(r);
}
static SIMDINLINE Double __conv(__m512d r)
{
return _mm512_castpd512_pd256(r);
}
static SIMDINLINE Integer __conv(__m512i r)
{
return _mm512_castsi512_si256(r);
}
public:
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
#define SIMD_DWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
}
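// As a concrete example of the pattern, SIMD_WRAPPER_2(add_ps) below expands
// (roughly) to:
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return __conv(_mm512_maskz_add_ps(__mmask16(0xff), __conv(a), __conv(b)));
//     }
// i.e. the full 512-bit instruction executes with only the low 8 lanes enabled.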
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff)); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
// SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8); // return a > b (int8)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
// return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
// {
//     return _mm256_permutevar8x32_ps(a, swiz);
// }
SIMD_IWRAPPER_1I_32(shuffle_epi32);
// template<int ImmT>
// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
__mmask16 m = 0xff;
m = _mm512_mask_test_epi32_mask(
m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
return __conv(
_mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
}
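// The test against 0x80000000 converts the legacy vector mask (sign bit per
// 32-bit lane) into the __mmask16 predicate the AVX512 gather expects, while
// the initial 0xff keeps the operation confined to the 8 active lanes.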
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
// {
// __mmask64 m = 0xffffffffull;
// return static_cast<uint32_t>(
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
__mmask16 m = 0xff;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_storeu_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1)));
}
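// Here the bit mask maps directly onto AVX512 lane masking; e.g. a
// hypothetical vmask_ps(0x5) yields all-ones in lanes 0 and 2, zero elsewhere.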
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2I

View file

@ -1,129 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation for Core processors
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are the ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
}
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
}
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
}
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and
// _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and
// _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and
// _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
// _mm512_packus_epi32
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffffffull;
return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_64

View file

@ -1,34 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation for Knights Family
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are the ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

View file

@ -1,699 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ)
#endif
#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS)
#endif
#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS)
#endif
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
// processors)
//
//============================================================================
static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_castsi512_ps( \
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
#define SIMD_WRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
static SIMDINLINE Integer vmask(__mmask16 m)
{
return _mm512_maskz_set1_epi32(m, -1);
}
static SIMDINLINE Integer vmask(__mmask8 m)
{
return _mm512_maskz_set1_epi64(m, -1LL);
}
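// These helpers widen an AVX512 predicate back into a legacy all-ones /
// all-zeros vector mask; e.g. vmask(__mmask16(0x3)) sets 32-bit lanes 0 and 1
// to -1 and clears the rest.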
public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
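// Usage sketch (hypothetical values): for lanes holding 1.25f, ceil_ps
// returns 2.0f per lane and floor_ps returns 1.0f per lane, via
// round_ps<RoundMode::CEIL_NOEXC> and round_ps<RoundMode::FLOOR_NOEXC>.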
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
// SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
// SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
// SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
// SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif
SIMD_IWRAPPER_2(srlv_epi32);
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm512_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm512_castps_si512(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm512_castsi512_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm512_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
return _mm512_castpd_si512(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm512_castsi512_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm512_cvtepi32_ps(a);
}
// SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm512_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm512_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
// Legacy vector mask generator
__mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
return castsi_ps(vmask(result));
}
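// Illustrative sketch (not compiled): the widened legacy mask from cmp_ps can
// feed blendv_ps directly, whereas cmp_ps_mask pairs with the native masked
// _mm512_mask_* forms. Names a, b are placeholders.
#if 0
Float sel = cmp_ps<CompareType::LT_OQ>(a, b); // all-ones lanes where a < b
Float r = blendv_ps(b, a, sel); // per-lane min(a, b)
#endif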
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
// SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a,
Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a,
Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}
template <int ImmT>
static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}
static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
Integer b,
Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
return _mm512_set1_ps(*p);
}
template <int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}
template <int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
return _mm512_extractf64x4_pd(a, imm);
}
template <int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
return _mm512_extracti64x4_epi64(a, imm);
}
template <int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}
template <int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
return _mm512_insertf64x4(a, b, imm);
}
template <int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
return _mm512_inserti64x4(a, b, imm);
}
// SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return _mm512_permute_ps(a, ImmT);
}
static SIMDINLINE Integer SIMDCALL
permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
return _mm512_permutexvar_epi32(swiz, a);
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm512_permutexvar_ps(swiz, a);
}
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
SIMD_IWRAPPER_1I(shuffle_epi32);
// SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
template <int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi16);
// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
// SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
// SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
// SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return _mm512_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return _mm512_load_si512(&p->v);
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm512_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm512_loadu_si512(p);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
__mmask16 k = _mm512_test_epi32_mask(castps_si(mask), set1_epi32(0x80000000));
return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}
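// Illustrative sketch (not compiled): the per-lane semantics of
// mask_i32gather_ps restated in scalar pseudocode; idx scaling is in bytes.
#if 0
for (int lane = 0; lane < 16; ++lane)
{
if (mask[lane] & 0x80000000) // float sign bit marks active lanes
dst[lane] = *(float const*)((int8 const*)p + idx[lane] * (int)ScaleT);
else
dst[lane] = old[lane]; // inactive lanes keep the previous value
}
#endif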
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
{
Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
_mm512_mask_store_ps(p, m, src);
}
// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
// __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
// return static_cast<uint64_t>(m);
//}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
__mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
return static_cast<uint32_t>(m);
}
static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value)
{
return _mm512_set1_epi64(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm512_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm512_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm512_set1_ps(f);
}
static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
return _mm512_setzero_pd();
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm512_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm512_setzero_si512();
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
{
_mm512_store_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL
storeu_si(Integer* p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm512_storeu_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm512_stream_ps(p, a);
}
static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
int i14,
int i13,
int i12,
int i11,
int i10,
int i9,
int i8,
int i7,
int i6,
int i5,
int i4,
int i3,
int i2,
int i1,
int i0)
{
return _mm512_set_epi32(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
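// Note: the 8-argument overload zero-fills lanes 8-15, so SIMD256-style
// callers keep working on this 16-wide target.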
static SIMDINLINE Float SIMDCALL set_ps(float i15,
float i14,
float i13,
float i12,
float i11,
float i10,
float i9,
float i8,
float i7,
float i6,
float i5,
float i4,
float i3,
float i2,
float i1,
float i0)
{
return _mm512_set_ps(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}
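// Illustrative sketch (not compiled): vmask_ps expands one bit per lane into a
// full-width legacy mask, e.g. vmask_ps(0x00FF) makes lanes 0-7 all-ones and
// lanes 8-15 zero, ready for blendv_ps-style selection. Names x, y are
// placeholders.
#if 0
Float m = vmask_ps(0x00FF); // lanes 0-7 selected
Float r = blendv_ps(x, y, m); // y in lanes 0-7, x in lanes 8-15
#endif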
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I


@ -1,186 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation for Core processors
//
//============================================================================
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_castsi512_ps( \
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
#define SIMD_WRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
static SIMDINLINE Integer vmask(__mmask32 m)
{
return _mm512_maskz_set1_epi16(m, -1);
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
return _mm512_maskz_set1_epi8(m, -1);
}
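// These overloads widen AVX512BW compare masks (one bit per 8- or 16-bit lane)
// into legacy all-ones/zero vector masks, mirroring the __mmask16/__mmask8
// vmask helpers of the base implementation.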
public:
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template <CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
SIMD_IWRAPPER_2(unpackhi_epi8); // See documentation for _mm512_unpackhi_epi8
SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16
SIMD_IWRAPPER_2(unpacklo_epi8); // See documentation for _mm512_unpacklo_epi8
SIMD_IWRAPPER_2(shuffle_epi8);
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
return static_cast<uint64_t>(m);
}
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I


@ -1,132 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation for Knights Family Processors
//
//============================================================================
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_castsi512_ps( \
_mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
#define SIMD_WRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
#define SIMD_IWRAPPER_1I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{ \
return intrin(a, ImmT); \
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{ \
return _mm512_##intrin(a, b, ImmT); \
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
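// AVX512F-only (Knights-class) parts lack the AVX512DQ float logical ops
// (_mm512_and_ps etc.), so these route through the integer _mm512_*_epi32
// forms with casts.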
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions


@ -1,852 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX (1) implementation
//============================================================================
static const int TARGET_SIMD_WIDTH = 8;
using SIMD128T = SIMD128Impl::AVXImpl;
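// This target has no native 512-bit registers, so SIMD16 operations are
// double-pumped: each wrapper below splits its arguments into the two 256-bit
// halves (v8[0], v8[1]), applies the SIMD256T operation to each half, and
// reassembles the result.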
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a) \
{ \
return Float{ \
SIMD256T::op(a.v8[0]), \
SIMD256T::op(a.v8[1]), \
}; \
}
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return Float{ \
SIMD256T::op(a.v8[0], b.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_WRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return Float{ \
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_WRAPPER_2I_1(op) \
template <int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
{ \
return Float{ \
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
{ \
return Float{ \
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
{ \
return Integer{ \
SIMD256T::op(a.v8[0]), \
SIMD256T::op(a.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::op(a.v8[0], b.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2I(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2I_1(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_2I_2(op) \
template <int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
{ \
return Integer{ \
SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \
SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \
}; \
}
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
{ \
return Integer{ \
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
}; \
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
{
return Float{
SIMD256T::template round_ps<RMT>(a.v8[0]),
SIMD256T::template round_ps<RMT>(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
{
return round_ps<RoundMode::CEIL_NOEXC>(a);
}
static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
{
return round_ps<RoundMode::FLOOR_NOEXC>(a);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
{
return Integer{
SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
template <int ImmT>
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32)
{
return Integer{
SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
};
}
template <int ImmT>
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32)
{
return Integer{
SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
};
}
template <int ImmT> // for each 128-bit lane:
static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint)
{
return Integer{
SIMD256T::template srli_si<ImmT>(a.v8[0]),
SIMD256T::template srli_si<ImmT>(a.v8[1]),
};
}
template <int ImmT>
static SIMDINLINE Float SIMDCALL
srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
{
return Float{
SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
{
return Float{
SIMD256T::castpd_ps(a.v8[0]),
SIMD256T::castpd_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
{
return Integer{
SIMD256T::castps_si(a.v8[0]),
SIMD256T::castps_si(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
{
return Double{
SIMD256T::castsi_pd(a.v8[0]),
SIMD256T::castsi_pd(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
{
return Double{
SIMD256T::castps_pd(a.v8[0]),
SIMD256T::castps_pd(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
{
return Float{
SIMD256T::castsi_ps(a.v8[0]),
SIMD256T::castsi_ps(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
{
return Float{
SIMD256T::cvtepi32_ps(a.v8[0]),
SIMD256T::cvtepi32_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16)
{
return Integer{
SIMD256T::cvtepu8_epi16(a.v4[0]),
SIMD256T::cvtepu8_epi16(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32)
{
return Integer{
SIMD256T::cvtepu8_epi32(a.v4[0]),
SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32)
{
return Integer{
SIMD256T::cvtepu16_epi32(a.v4[0]),
SIMD256T::cvtepu16_epi32(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64)
{
return Integer{
SIMD256T::cvtepu16_epi64(a.v4[0]),
SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL
cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64)
{
return Integer{
SIMD256T::cvtepu32_epi64(a.v4[0]),
SIMD256T::cvtepu32_epi64(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
{
return Integer{
SIMD256T::cvtps_epi32(a.v8[0]),
SIMD256T::cvtps_epi32(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL
cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return Integer{
SIMD256T::cvttps_epi32(a.v8[0]),
SIMD256T::cvttps_epi32(a.v8[1]),
};
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template <CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
{
return Float{
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GT_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::NEQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::EQ_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::GE_OQ>(a, b);
}
static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
{
return cmp_ps<CompareType::LE_OQ>(a, b);
}
template <CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
{
return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL
testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}
static SIMDINLINE bool SIMDCALL
testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Float const& mask) // return mask ? b : a (int)
{
return Integer{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
Integer const& b,
Integer const& mask) // return mask ? b : a (int)
{
return Integer{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
broadcast_ss(float const* p) // return *p (all elements in vector get same value)
{
float f = *p;
return Float{
SIMD256T::set1_ps(f),
SIMD256T::set1_ps(f),
};
}
template <int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template <int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template <int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template <int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
Float r = a;
r.v8[imm] = b;
return r;
}
template <int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
Double r = a;
r.v8[imm] = b;
return r;
}
template <int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
Integer r = a;
r.v8[imm] = b;
return r;
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
template <int ImmT>
static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
{
return Float{
SIMD256T::template permute_ps<ImmT>(a.v8[0]),
SIMD256T::template permute_ps<ImmT>(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL permute_epi32(
Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
return castps_si(permute_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL
permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
const auto mask = SIMD256T::set1_epi32(7);
auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
return Float{
SIMD256T::blendv_ps(
lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
SIMD256T::blendv_ps(
hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
};
}
// All of the 512-bit permute2f128_XX intrinsics do the following:
//
// SELECT4(src, control) {
// CASE(control[1:0])
// 0 : tmp[127:0] : = src[127:0]
// 1 : tmp[127:0] : = src[255:128]
// 2 : tmp[127:0] : = src[383:256]
// 3 : tmp[127:0] : = src[511:384]
// ESAC
// RETURN tmp[127:0]
// }
//
// dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
// dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
// dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
// dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
// dst[MAX:512] : = 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
template <int shuf>
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
{
return Float{
SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
a.v8[1]),
SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
{
return Double{
SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
a.v8[1]),
SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
{
return Integer{
SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
a.v8[1]),
SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
b.v8[1]),
};
}
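// Worked example (illustrative): shuf = 0xE4 selects {a.lane0, a.lane1,
// b.lane2, b.lane3}. The low-half control expands to 0x10
// ((0xE4 & 0x03) | ((0xE4 & 0x0C) << 2)), picking the low then high 128-bit
// lane of a.v8[0]; the high-half control expands to 0x32, picking the low
// then high lane of b.v8[1].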
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float{
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
};
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float{
SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL
load_ps(float const* p) // return *p (loads SIMD width elements from memory)
{
return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
{
return Integer{
SIMD256T::load_si(&p->v8[0]),
SIMD256T::load_si(&p->v8[1]),
};
}
static SIMDINLINE Float SIMDCALL
loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
{
return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
}
static SIMDINLINE Integer SIMDCALL
loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
{
return Integer{
SIMD256T::loadu_si(&p->v8[0]),
SIMD256T::loadu_si(&p->v8[1]),
};
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
return Float{
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
};
}
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
{
return Float{
SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
};
}
static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
{
SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
{
uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
return mask;
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
}
static SIMDINLINE void SIMDCALL
store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
{
SIMD256T::store_ps(p, a.v8[0]);
SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
{
SIMD256T::store_si(&p->v8[0], a.v8[0]);
SIMD256T::store_si(&p->v8[1], a.v8[1]);
}
static SIMDINLINE void SIMDCALL
stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
SIMD256T::stream_ps(p, a.v8[0]);
SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
int i14,
int i13,
int i12,
int i11,
int i10,
int i9,
int i8,
int i7,
int i6,
int i5,
int i4,
int i3,
int i2,
int i1,
int i0)
{
return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
}
static SIMDINLINE Integer SIMDCALL
set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(float i15,
float i14,
float i13,
float i12,
float i11,
float i10,
float i9,
float i8,
float i7,
float i6,
float i5,
float i4,
float i3,
float i2,
float i1,
float i0)
{
return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
}
static SIMDINLINE Float SIMDCALL
set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
}
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_3


@ -1,27 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// no backwards compatibility for simd mask-enabled functions


@ -1,332 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
//=======================================================================
// SIMD Types
//
// These typedefs are examples. The SIMD256 and SIMD16 implementations will
// use different base types with this same naming.
using Float = __m256; // Packed single-precision float vector
using Double = __m256d; // Packed double-precision float vector
using Integer = __m256i; // Packed integer vector (mutable element widths)
using Mask = uint8_t; // Integer representing mask bits
//=======================================================================
// Standard interface
// (available in both SIMD256 and SIMD16 widths)
//=======================================================================
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
static Float add_ps(Float a, Float b); // return a + b
static Float div_ps(Float a, Float b); // return a / b
static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
static Float max_ps(Float a, Float b); // return (a > b) ? a : b
static Float min_ps(Float a, Float b); // return (a < b) ? a : b
static Float mul_ps(Float a, Float b); // return a * b
static Float rcp_ps(Float a); // return 1.0f / a
static Float rsqrt_ps(Float a); // return 1.0f / sqrt(a)
static Float sub_ps(Float a, Float b); // return a - b
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
// return round_func(a)
//
// round_func is chosen based on the RMT template parameter. See the documentation
// for the RoundMode enumeration above.
template <RoundMode RMT>
static Float round_ps(Float a); // return round(a)
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
static Integer abs_epi32(Integer a); // return absolute_value(a) (int32)
static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
static Integer add_epi8(Integer a, Integer b); // return a + b (int8)
static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
static Integer mullo_epi32(Integer a, Integer b);
static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
static Float and_ps(Float a, Float b); // return a & b (float treated as int)
static Integer and_si(Integer a, Integer b); // return a & b (int)
static Float andnot_ps(Float a, Float b); // return (~a) & b (float treated as int)
static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
static Float or_ps(Float a, Float b); // return a | b (float treated as int)
static Integer or_si(Integer a, Integer b); // return a | b (int)
static Float xor_ps(Float a, Float b); // return a ^ b (float treated as int)
static Integer xor_si(Integer a, Integer b); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
static Integer slli_epi32(Integer a); // return a << ImmT
static Integer sllv_epi32(Integer a, Integer b); // return a << b
template<int ImmT>
static Integer srai_epi32(Integer a); // return a >> ImmT (int32)
template<int ImmT>
static Integer srli_epi32(Integer a); // return a >> ImmT (uint32)
template<int ImmT> // for each 128-bit lane:
static Integer srli_si(Integer a); // return a >> (ImmT*8) (uint)
template<int ImmT>
static Float srlisi_ps(Float a); // same as srli_si, but with Float cast to int
static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static Float castpd_ps(Double a); // return *(Float*)(&a)
static Integer castps_si(Float a); // return *(Integer*)(&a)
static Double castsi_pd(Integer a); // return *(Double*)(&a)
static Double castps_pd(Float a); // return *(Double*)(&a)
static Float castsi_ps(Integer a); // return *(Float*)(&a)
static Float cvtepi32_ps(Integer a); // return (float)a (int32 --> float)
static Integer cvtepu8_epi16(Integer a); // return (int16)a (uint8 --> int16)
static Integer cvtepu8_epi32(Integer a); // return (int32)a (uint8 --> int32)
static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
static Integer cvtps_epi32(Float a); // return (int32)a (float --> int32)
static Integer cvttps_epi32(Float a); // return (int32)a (rnd_to_zero(float) --> int32)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Comparison types used with cmp_ps:
// - ordered comparisons are always false if either operand is NaN
// - unordered comparisons are always true if either operand is NaN
// - signaling comparisons raise an exception if either operand is NaN
// - non-signaling comparisons will never raise an exception
//
// Ordered: return (a != NaN) && (b != NaN) && (a cmp b)
// Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
// return a (CmpTypeT) b (float)
//
// See documentation for CompareType above for valid values for CmpTypeT.
template<CompareType CmpTypeT>
static Float cmp_ps(Float a, Float b); // return a (CmpTypeT) b (see above)
static Float cmpgt_ps(Float a, Float b); // return cmp_ps<CompareType::GT_OQ>(a, b)
static Float cmple_ps(Float a, Float b); // return cmp_ps<CompareType::LE_OQ>(a, b)
static Float cmplt_ps(Float a, Float b); // return cmp_ps<CompareType::LT_OQ>(a, b)
static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b)
static Float cmpeq_ps(Float a, Float b); // return cmp_ps<CompareType::EQ_OQ>(a, b)
static Float cmpge_ps(Float a, Float b); // return cmp_ps<CompareType::GE_OQ>(a, b)
static Integer cmpeq_epi8(Integer a, Integer b); // return a == b (int8)
static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
static Integer cmpgt_epi8(Integer a, Integer b); // return a > b (int8)
static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
static bool testz_ps(Float a, Float b); // return all_lanes_zero(a & b) ? 1 : 0 (float)
static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int)
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template<int ImmT>
static Float blend_ps(Float a, Float b); // return ImmT ? b : a (float)
static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
static Float blendv_ps(Float a, Float b, Float mask); // return mask ? b : a (float)
static Float broadcast_ss(float const *p); // return *p (all elements in vector get same value)
static Integer packs_epi16(Integer a, Integer b); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
static Integer packs_epi32(Integer a, Integer b); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
static Integer permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
static Float permute_ps(Float a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (float)
template<int SwizT>
static Integer shuffle_epi32(Integer a, Integer b);
template<int SwizT>
static Integer shuffle_epi64(Integer a, Integer b);
static Integer shuffle_epi8(Integer a, Integer b);
template<int SwizT>
static Float shuffle_pd(Double a, Double b);
template<int SwizT>
static Float shuffle_ps(Float a, Float b);
static Integer unpackhi_epi16(Integer a, Integer b);
static Integer unpackhi_epi32(Integer a, Integer b);
static Integer unpackhi_epi64(Integer a, Integer b);
static Integer unpackhi_epi8(Integer a, Integer b);
static Float unpackhi_pd(Double a, Double b);
static Float unpackhi_ps(Float a, Float b);
static Integer unpacklo_epi16(Integer a, Integer b);
static Integer unpacklo_epi32(Integer a, Integer b);
static Integer unpacklo_epi64(Integer a, Integer b);
static Integer unpacklo_epi8(Integer a, Integer b);
static Float unpacklo_pd(Double a, Double b);
static Float unpacklo_ps(Float a, Float b);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
enum class ScaleFactor
{
SF_1, // No scaling
SF_2, // Scale offset by 2
SF_4, // Scale offset by 4
SF_8, // Scale offset by 8
};
template<ScaleFactor ScaleT = ScaleFactor::SF_1>
static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements)
static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory)
static Integer load_si(Integer const *p); // return *p
static Float loadu_ps(float const *p); // return *p (same as load_ps but allows for unaligned mem)
static Integer loadu_si(Integer const *p); // return *p (same as load_si but allows for unaligned mem)
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
static void maskstore_ps(float *p, Integer mask, Float src);
static int movemask_epi8(Integer a);
static int movemask_pd(Double a);
static int movemask_ps(Float a);
static Integer set1_epi32(int i); // return i (all elements are same value)
static Integer set1_epi8(char i); // return i (all elements are same value)
static Float set1_ps(float f); // return f (all elements are same value)
static Float setzero_ps(); // return 0 (float)
static Integer setzero_si(); // return 0 (integer)
static void store_ps(float *p, Float a); // *p = a (stores all elements contiguously in memory)
static void store_si(Integer *p, Integer a); // *p = a
static void stream_ps(float *p, Float a); // *p = a (same as store_ps, but doesn't keep memory in cache)
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
static Float broadcast_ps(__m128 const *p);
template<int ImmT>
static __m128d extractf128_pd(Double a);
template<int ImmT>
static __m128 extractf128_ps(Float a);
template<int ImmT>
static __m128i extractf128_si(Integer a);
template<int ImmT>
static Double insertf128_pd(Double a, __m128d b);
template<int ImmT>
static Float insertf128_ps(Float a, __m128 b);
template<int ImmT>
static Integer insertf128_si(Integer a, __m128i b);
static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
template<int ImmT>
static Double permute2f128_pd(Double a, Double b);
template<int ImmT>
static Float permute2f128_ps(Float a, Float b);
template<int ImmT>
static Integer permute2f128_si(Integer a, Integer b);
static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
static void storeu2_si(__m128i *phi, __m128i *plo, Integer src);
//=======================================================================
// Advanced masking interface (currently available only in SIMD16 width)
//=======================================================================
};
#endif // #if 0
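Editor's note: the ordered/unordered distinction documented for CompareType above is easy to get wrong, so here is a scalar reference (illustrative only, plain C++) of the NaN semantics it describes: ordered comparisons are false when either operand is NaN, unordered comparisons are true.

#include <cmath>
#include <cstdio>

static bool cmp_lt_ordered(float a, float b)    // LT_OQ / LT_OS semantics
{
    return !std::isnan(a) && !std::isnan(b) && (a < b);
}

static bool cmp_nlt_unordered(float a, float b) // NLT_UQ / NLT_US semantics
{
    return std::isnan(a) || std::isnan(b) || !(a < b);
}

int main()
{
    float nan = std::nanf("");
    // With a NaN operand the ordered compare is false, the unordered true:
    std::printf("%d %d\n", cmp_lt_ordered(nan, 1.0f), cmp_nlt_unordered(nan, 1.0f)); // 0 1
}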

View file

@ -1,457 +0,0 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if !defined(__cplusplus)
#error C++ compilation required
#endif
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
#define SIMD_ARCH_AVX 0
#define SIMD_ARCH_AVX2 1
#define SIMD_ARCH_AVX512 2
#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif
#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
namespace SIMDImpl
{
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
#if SIMD_ARCH >= SIMD_ARCH_AVX512
enum class CompareTypeInt
{
EQ = _MM_CMPINT_EQ, // Equal
LT = _MM_CMPINT_LT, // Less than
LE = _MM_CMPINT_LE, // Less than or Equal
NE = _MM_CMPINT_NE, // Not Equal
GE = _MM_CMPINT_GE, // Greater than or Equal
GT = _MM_CMPINT_GT, // Greater than
};
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
struct Traits
{
using CompareType = SIMDImpl::CompareType;
using ScaleFactor = SIMDImpl::ScaleFactor;
using RoundMode = SIMDImpl::RoundMode;
};
// Attribute, 4-dimensional attribute in SIMD SOA layout
template <typename Float, typename Integer, typename Double>
union Vec4
{
Float v[4];
Integer vi[4];
Double vd[4];
struct
{
Float x;
Float y;
Float z;
Float w;
};
SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; }
SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; }
SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in)
{
v[0] = in.v[0];
v[1] = in.v[1];
v[2] = in.v[2];
v[3] = in.v[3];
return *this;
}
};
namespace SIMD128Impl
{
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m128 in) : v(in) {}
SIMDINLINE Float& SIMDCALL operator=(__m128 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128() const { return v; }
SIMDALIGN(__m128, 16) v;
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m128i in) : v(in) {}
SIMDINLINE Integer& SIMDCALL operator=(__m128i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128i() const { return v; }
SIMDALIGN(__m128i, 16) v;
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m128d in) : v(in) {}
SIMDINLINE Double& SIMDCALL operator=(__m128d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m128d() const { return v; }
SIMDALIGN(__m128d, 16) v;
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t;
static const uint32_t SIMD_WIDTH = 4;
} // namespace SIMD128Impl
namespace SIMD256Impl
{
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m256 in) : v(in) {}
SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
SIMD128Impl::Float const& in_hi = _mm_setzero_ps())
{
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
}
SIMDINLINE Float& SIMDCALL operator=(__m256 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256() const { return v; }
SIMDALIGN(__m256, 32) v;
SIMD128Impl::Float v4[2];
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m256i in) : v(in) {}
SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
SIMD128Impl::Integer const& in_hi = _mm_setzero_si128())
{
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
}
SIMDINLINE Integer& SIMDCALL operator=(__m256i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256i() const { return v; }
SIMDALIGN(__m256i, 32) v;
SIMD128Impl::Integer v4[2];
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m256d const& in) : v(in) {}
SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
SIMD128Impl::Double const& in_hi = _mm_setzero_pd())
{
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
}
SIMDINLINE Double& SIMDCALL operator=(__m256d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
v = in.v;
return *this;
}
SIMDINLINE SIMDCALL operator __m256d() const { return v; }
SIMDALIGN(__m256d, 32) v;
SIMD128Impl::Double v4[2];
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t;
static const uint32_t SIMD_WIDTH = 8;
} // namespace SIMD256Impl
namespace SIMD512Impl
{
#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to be viewed
// in a debugger. Do NOT access them via code!
union __m512
{
private:
float m512_f32[16];
};
struct __m512d
{
private:
double m512d_f64[8];
};
union __m512i
{
private:
int8_t m512i_i8[64];
int16_t m512i_i16[32];
int32_t m512i_i32[16];
int64_t m512i_i64[8];
uint8_t m512i_u8[64];
uint16_t m512i_u16[32];
uint32_t m512i_u32[16];
uint64_t m512i_u64[8];
};
using __mmask16 = uint16_t;
#endif
#if defined(__INTEL_COMPILER) || (SIMD_ARCH >= SIMD_ARCH_AVX512)
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m512 in) : v(in) {}
SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Float& SIMDCALL operator=(__m512 in)
{
v = in;
return *this;
}
SIMDINLINE Float& SIMDCALL operator=(Float const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512() const { return v; }
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Float v8[2];
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m512i in) : v(in) {}
SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Integer& SIMDCALL operator=(__m512i in)
{
v = in;
return *this;
}
SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512i() const { return v; }
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Integer v8[2];
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m512d in) : v(in) {}
SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
{
v8[0] = in_lo;
v8[1] = in_hi;
}
SIMDINLINE Double& SIMDCALL operator=(__m512d in)
{
v = in;
return *this;
}
SIMDINLINE Double& SIMDCALL operator=(Double const& in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE SIMDCALL operator __m512d() const { return v; }
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Double v8[2];
};
typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
using Mask = __mmask16;
static const uint32_t SIMD_WIDTH = 16;
#undef SIMD_ALIGNMENT_BYTES
} // namespace SIMD512Impl
} // namespace SIMDImpl
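Editor's note: the Vec4 union above stores a 4-dimensional attribute in SIMD structure-of-arrays layout: each of x, y, z and w is itself a full SIMD vector, so one Vec4 holds SIMD_WIDTH complete attributes whose components can be processed a register at a time. A small sketch with plain arrays (hypothetical names, not the SWR types):

#include <array>
#include <cstdio>

constexpr int SIMD_WIDTH = 8;               // e.g. SIMD256Impl::SIMD_WIDTH
using Lane = std::array<float, SIMD_WIDTH>; // stand-in for a SIMD Float

struct Vec4SoA
{
    Lane x, y, z, w; // component c of attribute j lives at c[j]
};

int main()
{
    Vec4SoA attr{};
    for (int j = 0; j < SIMD_WIDTH; ++j)
        attr.x[j] = float(j); // touch the x component of 8 attributes at once
    std::printf("%f\n", attr.x[3]); // 3.000000
}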

View file

@ -1,299 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "common/os.h"
#include <stdarg.h>
#include <stdio.h>
#include <assert.h>
#include <algorithm>
#include <mutex>
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#if defined(_MSC_VER)
#pragma comment(lib, "user32.lib")
#endif // _MSC_VER
namespace ConsoleUtils
{
enum class TextColor
{
BLACK = 0,
#if defined(_WIN32)
RED = 4,
GREEN = 2,
BLUE = 1,
#else
RED = 1,
GREEN = 2,
BLUE = 4,
#endif // _WIN32
PURPLE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE),
CYAN = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
YELLOW = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN),
WHITE =
static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
};
enum class TextStyle
{
NORMAL = 0,
INTENSITY = 1,
};
void SetTextColor(FILE* stream,
TextColor color = TextColor::WHITE,
TextStyle style = TextStyle::NORMAL)
{
#if defined(_WIN32)
HANDLE hConsoleHandle = nullptr;
if (stream == stderr)
{
hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE);
}
else if (stream == stdout)
{
hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
}
else
{
// Not a console stream, do nothing
return;
}
WORD textAttributes = static_cast<WORD>(color);
if (style == TextStyle::INTENSITY)
{
textAttributes |= FOREGROUND_INTENSITY;
}
SetConsoleTextAttribute(hConsoleHandle, textAttributes);
#else // !_WIN32
// Print ANSI codes
uint32_t cc =
30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color);
fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), cc);
#endif
}
void ResetTextColor(FILE* stream)
{
#if defined(_WIN32)
SetTextColor(stream);
#else // !_WIN32
// Print ANSI codes
fprintf(stream, "\033[0m");
#endif
}
static std::mutex g_stderrMutex;
} // namespace ConsoleUtils
bool SwrAssert(bool chkDebugger,
bool& enabled,
const char* pExpression,
const char* pFileName,
uint32_t lineNum,
const char* pFunction,
const char* pFmtString,
...)
{
using namespace ConsoleUtils;
std::lock_guard<std::mutex> l(g_stderrMutex);
SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
fprintf(stderr, "%s(%d): ", pFileName, lineNum);
SetTextColor(stderr, TextColor::RED, TextStyle::INTENSITY);
fprintf(stderr, "ASSERT: %s\n", pExpression);
SetTextColor(stderr, TextColor::CYAN, TextStyle::INTENSITY);
fprintf(stderr, "\t%s\n", pFunction);
if (pFmtString)
{
SetTextColor(stderr, TextColor::YELLOW, TextStyle::INTENSITY);
fprintf(stderr, "\t");
va_list args;
va_start(args, pFmtString);
vfprintf(stderr, pFmtString, args);
va_end(args);
fprintf(stderr, "\n");
}
ResetTextColor(stderr);
fflush(stderr);
#if defined(_WIN32)
static const int MAX_MESSAGE_LEN = 2048;
char msgBuf[MAX_MESSAGE_LEN];
sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
sprintf_s(msgBuf, "\t%s\n", pFunction);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
int offset = 0;
if (pFmtString)
{
va_list args;
va_start(args, pFmtString);
offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
va_end(args);
if (offset < 0)
{
return true;
}
OutputDebugStringA("\t");
OutputDebugStringA(msgBuf);
OutputDebugStringA("\n");
}
if (enabled && KNOB_ENABLE_ASSERT_DIALOGS)
{
int retval = sprintf_s(&msgBuf[offset],
MAX_MESSAGE_LEN - offset,
"\n\n"
"File: %s\n"
"Line: %d\n"
"\n"
"Expression: %s\n\n"
"Cancel: Disable this assert for the remainder of the process\n"
"Try Again: Break into the debugger\n"
"Continue: Continue execution (but leave assert enabled)",
pFileName,
lineNum,
pExpression);
if (retval < 0)
{
return true;
}
offset += retval;
if (!IsDebuggerPresent())
{
sprintf_s(&msgBuf[offset],
MAX_MESSAGE_LEN - offset,
"\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a "
"program crash!");
}
retval = MessageBoxA(nullptr,
msgBuf,
"Assert Failed",
MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND);
switch (retval)
{
case IDCANCEL:
enabled = false;
return false;
case IDTRYAGAIN:
return true;
case IDCONTINUE:
return false;
}
}
else
{
return (IsDebuggerPresent() || !chkDebugger) && enabled;
}
#endif // _WIN32
return enabled;
}
void SwrTrace(
const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...)
{
using namespace ConsoleUtils;
std::lock_guard<std::mutex> l(g_stderrMutex);
SetTextColor(stderr, TextColor::CYAN, TextStyle::NORMAL);
fprintf(stderr, "%s(%d): TRACE in %s:\n", pFileName, lineNum, pFunction);
if (pFmtString)
{
SetTextColor(stderr, TextColor::PURPLE, TextStyle::INTENSITY);
fprintf(stderr, "\t");
va_list args;
va_start(args, pFmtString);
vfprintf(stderr, pFmtString, args);
va_end(args);
fprintf(stderr, "\n");
}
ResetTextColor(stderr);
fflush(stderr);
#if defined(_WIN32)
static const int MAX_MESSAGE_LEN = 2048;
char msgBuf[MAX_MESSAGE_LEN];
sprintf_s(msgBuf, "%s(%d): TRACE in %s\n", pFileName, lineNum, pFunction);
msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
msgBuf[MAX_MESSAGE_LEN - 1] = 0;
OutputDebugStringA(msgBuf);
int offset = 0;
if (pFmtString)
{
va_list args;
va_start(args, pFmtString);
offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
va_end(args);
if (offset < 0)
{
return;
}
OutputDebugStringA("\t");
OutputDebugStringA(msgBuf);
OutputDebugStringA("\n");
}
#endif // _WIN32
}
#endif // SWR_ENABLE_ASSERTS
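Editor's note: the non-Windows branch of SetTextColor above builds its escape sequence from the ANSI foreground range: codes 30-37 select the normal colors and 90-97 the high-intensity ones, hence the 30 + (intensity ? 60 : 0) + color computation. A standalone sketch of just that path (hypothetical names):

#include <cstdio>

enum Color { RED = 1, GREEN = 2, BLUE = 4 }; // POSIX bit values, as above

static void set_color(std::FILE* s, int color, bool intense)
{
    // "\033[0m" resets first; "%d;%dm" is style (0 normal / 1 bold) then
    // the computed foreground code, exactly as SetTextColor emits it.
    std::fprintf(s, "\033[0m\033[%d;%dm",
                 intense ? 1 : 0, 30 + (intense ? 60 : 0) + color);
}

int main()
{
    set_color(stderr, RED | GREEN, true); // bright yellow, as the assert path uses
    std::fprintf(stderr, "ASSERT-style message\n");
    std::fprintf(stderr, "\033[0m"); // reset, mirroring ResetTextColor
}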

View file

@ -1,242 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#ifndef __SWR_ASSERT_H__
#define __SWR_ASSERT_H__
#if !defined(__SWR_OS_H__)
#error swr_assert.h should not be included directly, please include "common/os.h" instead.
#endif
//=============================================================================
//
// MACROS defined in this file:
//
// - SWR_ASSUME(expression, ...): Tell compiler that the expression is true.
// Helps with static code analysis as well.
// DO NOT USE if code after this dynamically
// checks for errors and handles them. The
// compiler may optimize out the error check.
//
// - SWR_ASSERT(expression, ...): Inform the user if expression is false.
// This check is only conditionally made,
// usually only in debug mode.
//
// - SWR_REL_ASSERT(expression, ...): Unconditionally enabled version of SWR_ASSERT
//
// - SWR_ASSUME_ASSERT(expression, ...): Conditionally enabled SWR_ASSERT. Uses
// SWR_ASSUME if SWR_ASSERT is disabled.
// DO NOT USE in combination with actual
// error checking (see SWR_ASSUME)
//
// - SWR_REL_ASSUME_ASSERT(expression, ...): Same as SWR_REL_ASSERT.
//
//=============================================================================
// Stupid preprocessor tricks to avoid -Wall / -W4 warnings
#if defined(_MSC_VER)
#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127))
#define _SWR_WARN_RESTORE __pragma(warning(pop))
#else // ! MSVC compiler
#define _SWR_WARN_DISABLE
#define _SWR_WARN_RESTORE
#endif
#define _SWR_MACRO_START \
do \
{
#define _SWR_MACRO_END \
_SWR_WARN_DISABLE \
} \
while (0) \
_SWR_WARN_RESTORE
#if defined(_MSC_VER)
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START __assume(e); \
_SWR_MACRO_END
#elif defined(__clang__)
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START __builtin_assume(e); \
_SWR_MACRO_END
#elif defined(__GNUC__)
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \
_SWR_MACRO_END
#else
#define SWR_ASSUME(e, ...) \
_SWR_MACRO_START ASSUME(e); \
_SWR_MACRO_END
#endif
#if !defined(SWR_ENABLE_ASSERTS)
#if !defined(NDEBUG)
#define SWR_ENABLE_ASSERTS 1
#else
#define SWR_ENABLE_ASSERTS 0
#endif // NDEBUG
#endif // SWR_ENABLE_ASSERTS
#if !defined(SWR_ENABLE_REL_ASSERTS)
#define SWR_ENABLE_REL_ASSERTS 1
#endif
#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
#include "assert.h"
#if !defined(__cplusplus)
#pragma message("C++ is required for SWR Asserts, falling back to assert.h")
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) assert(e)
#endif
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) assert(e)
#endif
#else
bool SwrAssert(bool chkDebugger,
bool& enabled,
const char* pExpression,
const char* pFileName,
uint32_t lineNum,
const char* function,
const char* pFmtString = nullptr,
...);
void SwrTrace(
const char* pFileName, uint32_t lineNum, const char* function, const char* pFmtString, ...);
#define _SWR_ASSERT(chkDebugger, e, ...) \
_SWR_MACRO_START \
bool expFailed = !(e); \
if (expFailed) \
{ \
static bool swrAssertEnabled = true; \
expFailed = SwrAssert( \
chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
if (expFailed) \
{ \
DEBUGBREAK; \
} \
} \
_SWR_MACRO_END
#define _SWR_INVALID(chkDebugger, ...) \
_SWR_MACRO_START \
static bool swrAssertEnabled = true; \
bool expFailed = SwrAssert( \
chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
if (expFailed) \
{ \
DEBUGBREAK; \
} \
_SWR_MACRO_END
#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__);
#if SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__)
#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
#endif // SWR_ENABLE_ASSERTS
#if SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__)
#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
// SWR_INVALID is always enabled
// Funky handling to allow 0 arguments with g++/gcc
// This is needed because you can't "swallow commas" with ##__VA_ARGS__ unless
// there is a first argument to the macro. So having a macro that can optionally
// accept 0 arguments is tricky.
#define _SWR_INVALID_0() _SWR_INVALID(false)
#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__)
#define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#define _SWR_INVALID_CONCAT_(a, b) a##b
#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b)
#define SWR_INVALID(...) \
_SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \
(__VA_ARGS__)
#define SWR_STATIC_ASSERT(expression, ...) \
static_assert((expression), "Failed:\n " #expression "\n " __VA_ARGS__);
#endif // SWR_ENABLE_REL_ASSERTS
#endif // C++
#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
// Needed to allow passing bitfield members to sizeof() in disabled asserts
template <typename T>
static bool SwrSizeofWorkaround(T)
{
return false;
}
#if !SWR_ENABLE_ASSERTS
#define SWR_ASSERT(e, ...) \
_SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
_SWR_MACRO_END
#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
#define SWR_TRACE(_fmtstr, ...) \
_SWR_MACRO_START(void)(0); \
_SWR_MACRO_END
#endif
#if !SWR_ENABLE_REL_ASSERTS
#define SWR_REL_ASSERT(e, ...) \
_SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
_SWR_MACRO_END
#define SWR_INVALID(...) \
_SWR_MACRO_START(void)(0); \
_SWR_MACRO_END
#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
#define SWR_REL_TRACE(_fmtstr, ...) \
_SWR_MACRO_START(void)(0); \
_SWR_MACRO_END
#define SWR_STATIC_ASSERT(e, ...) \
_SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
_SWR_MACRO_END
#endif
#if defined(_MSC_VER)
#define SWR_FUNCTION_DECL __FUNCSIG__
#elif (defined(__GNUC__) || defined(__clang__))
#define SWR_FUNCTION_DECL __PRETTY_FUNCTION__
#else
#define SWR_FUNCTION_DECL __FUNCTION__
#endif
#define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL)
#endif //__SWR_ASSERT_H__
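Editor's note: the SWR_INVALID machinery above works by detecting whether the macro received any arguments at all. A self-contained demo of the same trick (hypothetical macro names); like the original, it relies on compilers that accept empty variadic macro arguments (gcc/clang, or C++20):

#include <cstdio>

#define PICK_11TH(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
#define SELECT(...) PICK_11TH(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
#define PROBE() 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#define VARIANT(...) SELECT(PROBE __VA_ARGS__())

int main()
{
    // With no arguments, PROBE() expands and injects ten extra arguments,
    // shifting PICK_11TH onto the 0; with arguments, PROBE never meets a
    // '(' and stays unexpanded, so the selector lands on a 1.
    std::printf("%d %d\n", VARIANT(), VARIANT("fmt", 1)); // prints: 0 1
}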

File diff suppressed because it is too large

View file

@ -1,772 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file api.h
*
* @brief API definitions
*
******************************************************************************/
#ifndef __SWR_API_H__
#define __SWR_API_H__
#include "common/os.h"
#include <assert.h>
#include <algorithm>
#include "common/intrin.h"
#include "common/formats.h"
#include "core/state.h"
typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
//////////////////////////////////////////////////////////////////////////
/// @brief Rectangle structure
struct SWR_RECT
{
int32_t xmin; ///< inclusive
int32_t ymin; ///< inclusive
int32_t xmax; ///< exclusive
int32_t ymax; ///< exclusive
bool operator==(const SWR_RECT& rhs)
{
return (this->ymin == rhs.ymin && this->ymax == rhs.ymax && this->xmin == rhs.xmin &&
this->xmax == rhs.xmax);
}
bool operator!=(const SWR_RECT& rhs) { return !(*this == rhs); }
SWR_RECT& Intersect(const SWR_RECT& other)
{
this->xmin = std::max(this->xmin, other.xmin);
this->ymin = std::max(this->ymin, other.ymin);
this->xmax = std::min(this->xmax, other.xmax);
this->ymax = std::min(this->ymax, other.ymax);
if (xmax - xmin < 0 || ymax - ymin < 0)
{
// Zero area
ymin = ymax = xmin = xmax = 0;
}
return *this;
}
SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); }
SWR_RECT& Union(const SWR_RECT& other)
{
this->xmin = std::min(this->xmin, other.xmin);
this->ymin = std::min(this->ymin, other.ymin);
this->xmax = std::max(this->xmax, other.xmax);
this->ymax = std::max(this->ymax, other.ymax);
return *this;
}
SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); }
void Translate(int32_t x, int32_t y)
{
xmin += x;
ymin += y;
xmax += x;
ymax += y;
}
};
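#if 0 // Editor's note: illustrative usage sketch, not part of the original header.
// SWR_RECT uses inclusive mins and exclusive maxes; a non-overlapping
// Intersect collapses the result to the zero rectangle.
static void ExampleRectUsage()
{
    SWR_RECT a{0, 0, 100, 100};
    SWR_RECT b{50, 50, 200, 200};
    a &= b;             // a == {50, 50, 100, 100}
    SWR_RECT c{200, 200, 300, 300};
    a.Intersect(c);     // no overlap -> {0, 0, 0, 0}
    a.Translate(10, 5); // shift the whole rect by (+10, +5)
}
#endif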
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for load hot tiles
/// @param hDC - handle to DRAW_CONTEXT
/// @param dstFormat - format of the hot tile
/// @param renderTargetIndex - render target to load, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pDstHotTile - pointer to the hot tile surface
typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE hDC,
HANDLE hWorkerPrivateData,
SWR_FORMAT dstFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x,
uint32_t y,
uint32_t renderTargetArrayIndex,
uint8_t* pDstHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for store hot tiles
/// @param hDC - handle to DRAW_CONTEXT
/// @param srcFormat - format of the hot tile
/// @param renderTargetIndex - render target to store, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param pSrcHotTile - pointer to the hot tile surface
typedef void(SWR_API* PFN_STORE_TILE)(HANDLE hDC,
HANDLE hWorkerPrivateData,
SWR_FORMAT srcFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
uint32_t x,
uint32_t y,
uint32_t renderTargetArrayIndex,
uint8_t* pSrcHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for clearing a hot tile from the hot tile's clear value
/// @param hPrivateContext - handle to private data
/// @param rtIndex - render target to clear, can be color, depth or stencil
/// @param x - destination x coordinate
/// @param y - destination y coordinate
/// @param renderTargetArrayIndex - render target array offset from arrayIndex
/// @param pClearColor - pointer to the hot tile's clear value
typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE hPrivateContext,
HANDLE hWorkerPrivateData,
SWR_RENDERTARGET_ATTACHMENT rtIndex,
uint32_t x,
uint32_t y,
uint32_t renderTargetArrayIndex,
const float* pClearColor);
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE hPrivateContext,
gfxptr_t xpAddr,
bool* pbNullTileAccessed,
HANDLE hPrivateWorkerData);
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE hPrivateContext,
gfxptr_t xpAddr,
bool* pbNullTileAccessed,
HANDLE hPrivateWorkerData);
typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr);
typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory);
typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of streamout write offset.
/// This call is made for any draw operation that has streamout enabled
/// and has updated the write offset.
/// @param hPrivateContext - handle to private data
/// @param soBufferSlot - buffer slot for write offset
/// @param soWriteOffset - update value for so write offset.
typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
uint32_t soBufferSlot,
uint32_t soWriteOffset);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of stats.
/// @param hPrivateContext - handle to private data
/// @param pStats - pointer to draw stats
typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of FE stats.
/// @note It's optimal to have a separate callback for FE stats since
/// there is only one DC per FE thread. This means we do not have
/// to sum up the stats across all of the workers.
/// @param hPrivateContext - handle to private data
/// @param pStats - pointer to draw stats
typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update StreamOut status
/// @param hPrivateContext - handle to private data
/// @param numPrims - number of primitives written to StreamOut buffer
typedef void(SWR_API* PFN_UPDATE_STREAMOUT)(HANDLE hPrivateContext, uint64_t numPrims);
//////////////////////////////////////////////////////////////////////////
/// BucketManager
/// Forward Declaration (see rdtsc_buckets.h for full definition)
/////////////////////////////////////////////////////////////////////////
class BucketManager;
//////////////////////////////////////////////////////////////////////////
/// SWR_THREADING_INFO
/////////////////////////////////////////////////////////////////////////
struct SWR_THREADING_INFO
{
uint32_t BASE_NUMA_NODE;
uint32_t BASE_CORE;
uint32_t BASE_THREAD;
uint32_t MAX_WORKER_THREADS;
uint32_t MAX_NUMA_NODES;
uint32_t MAX_CORES_PER_NUMA_NODE;
uint32_t MAX_THREADS_PER_CORE;
bool SINGLE_THREADED;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_API_THREADING_INFO
/// Data used to reserve HW threads for API use
/// API Threads are reserved from numa nodes / cores used for
/// SWR Worker threads. Specifying reserved threads here can reduce
/// the total number of SWR worker threads.
/////////////////////////////////////////////////////////////////////////
struct SWR_API_THREADING_INFO
{
uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not sent
uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads is > 0,
// binds thread used in SwrCreateContext to API Reserved
// thread 0
uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number.
// Independent of KNOB_MAX_THREADS_PER_CORE.
};
//////////////////////////////////////////////////////////////////////////
/// SWR_CONTEXT
/// Forward Declaration (see context.h for full definition)
/////////////////////////////////////////////////////////////////////////
struct SWR_CONTEXT;
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_PRIVATE_STATE
/// Data used to allocate per-worker thread private data. A pointer
/// to this data will be passed in to each shader function.
/// The first field of this private data must be SWR_WORKER_DATA
/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_PRIVATE_STATE
{
typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null
///< worker data will be initialized to 0.
PFN_WORKER_DATA pfnFinishWorkerData; ///< Finish / destroy function for worker data.
///< Can be null.
};
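#if 0 // Editor's note: illustrative sketch (hypothetical size and stub), not original code.
// A driver requesting 64 bytes of per-worker private data with a simple
// zero-fill init hook; pfnFinishWorkerData may remain null.
static void SWR_API ExampleInitWorkerData(SWR_CONTEXT* pContext,
                                          HANDLE hWorkerPrivateData,
                                          uint32_t iWorkerNum)
{
    memset(hWorkerPrivateData, 0, 64); // this worker's private block
}

static SWR_WORKER_PRIVATE_STATE ExampleWorkerState()
{
    SWR_WORKER_PRIVATE_STATE state = {};
    state.perWorkerPrivateStateSize = 64; // must be >= sizeof(SWR_WORKER_DATA)
    state.pfnInitWorkerData         = &ExampleInitWorkerData;
    return state;
}
#endif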
//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
struct SWR_CREATECONTEXT_INFO
{
// External functions (e.g. sampler) need per draw context state.
// Use SwrGetPrivateContextState() to access private state.
size_t privateStateSize;
// Optional per-worker state, can be NULL for no worker-private data
SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState;
// Callback functions
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
// Pointer to rdtsc buckets mgr returned to the caller.
// Only populated when KNOB_ENABLE_RDTSC is set
BucketManager* pBucketMgr;
// Output: size of the memory block required for SwrSaveState / SwrRestoreState
size_t contextSaveSize;
// ArchRast event manager.
HANDLE hArEventManager;
// handle to external memory for worker data to create memory contexts
HANDLE hExternalMemory;
// Input (optional): Threading info that overrides any set KNOB values.
SWR_THREADING_INFO* pThreadInfo;
// Input (optional): Info for reserving API threads
SWR_API_THREADING_INFO* pApiThreadInfo;
// Input: if set to non-zero value, overrides KNOB value for maximum
// number of draws in flight
uint32_t MAX_DRAWS_IN_FLIGHT;
std::string contextName;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo);
//////////////////////////////////////////////////////////////////////////
/// @brief Destroys SWR Context.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrDestroyContext, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Bind current thread to an API reserved HW thread
/// @param hContext - Handle passed back from SwrCreateContext
/// @param apiThreadId - index of reserved HW thread to bind to.
SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId);
//////////////////////////////////////////////////////////////////////////
/// @brief Saves API state associated with hContext
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pOutputStateBlock - Memory block to receive API state data
/// @param memSize - Size of memory pointed to by pOutputStateBlock
SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize);
//////////////////////////////////////////////////////////////////////////
/// @brief Restores API state to hContext previously saved with SwrSaveState
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pStateBlock - Memory block to read API state data from
/// @param memSize - Size of memory pointed to by pStateBlock
SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize);
//////////////////////////////////////////////////////////////////////////
/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
/// has been completed
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - pointer to callback function,
/// @param userData - user data to pass back
SWR_FUNC(void,
SwrSync,
HANDLE hContext,
PFN_CALLBACK_FUNC pfnFunc,
uint64_t userData,
uint64_t userData2,
uint64_t userData3);
//////////////////////////////////////////////////////////////////////////
/// @brief Stall cmd. Stalls the backend until all previous work has been completed.
/// Frontend work can continue to make progress
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrStallBE, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until all rendering has been completed.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until all FE rendering has been completed.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Set vertex buffer state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numBuffers - Number of vertex buffer state descriptors.
/// @param pVertexBuffers - Array of vertex buffer state descriptors.
SWR_FUNC(void,
SwrSetVertexBuffers,
HANDLE hContext,
uint32_t numBuffers,
const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
//////////////////////////////////////////////////////////////////////////
/// @brief Set index buffer
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pIndexBuffer - Index buffer.
SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
//////////////////////////////////////////////////////////////////////////
/// @brief Set fetch shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFetchFunc - Pointer to shader.
SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnSoFunc - Pointer to shader.
/// @param streamIndex - specifies stream
SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pSoState - Pointer to streamout state.
SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set streamout buffer state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pSoBuffer - Pointer to streamout buffer.
/// @param slot - Slot to bind SO buffer to.
SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot);
//////////////////////////////////////////////////////////////////////////
/// @brief Set vertex shader pointer.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnVertexFunc - Pointer to shader.
SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set frontend state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set geometry shader state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set geometry shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnGsFunc - Pointer to geometry shader function
SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set compute shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnCsFunc - Pointer to compute shader function
/// @param totalThreadsInGroup - product of thread group dimensions.
/// @param totalSpillFillSize - size in bytes needed for spill/fill.
/// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance
/// @param numInstances - number of simd instances that are run per execution of the shader
SWR_FUNC(void,
SwrSetCsFunc,
HANDLE hContext,
PFN_CS_FUNC pfnCsFunc,
uint32_t totalThreadsInGroup,
uint32_t totalSpillFillSize,
uint32_t scratchSpaceSizePerInstance,
uint32_t numInstances);
//////////////////////////////////////////////////////////////////////////
/// @brief Set tessellation state.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state
SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set hull shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - Pointer to shader function
SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set domain shader
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pfnFunc - Pointer to shader function
SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief Set depth stencil state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set backend state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set depth bounds state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set pixel shader state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set blend state
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pState - Pointer to state.
SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState);
//////////////////////////////////////////////////////////////////////////
/// @brief Set blend function
/// @param hContext - Handle passed back from SwrCreateContext
/// @param renderTarget - render target index
/// @param pfnBlendFunc - function pointer
SWR_FUNC(
void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDraw
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param startVertex - Specifies start vertex in vertex buffer for draw.
/// @param primCount - Number of vertices.
SWR_FUNC(void,
SwrDraw,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t startVertex,
uint32_t primCount);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDrawInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
/// @param numInstances - How many instances to render.
/// @param startVertex - Specifies start vertex for draw. (vertex data)
/// @param startInstance - Which instance to start sequentially fetching from in each buffer
/// (instanced data)
SWR_FUNC(void,
SwrDrawInstanced,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numVertsPerInstance,
uint32_t numInstances,
uint32_t startVertex,
uint32_t startInstance);
//////////////////////////////////////////////////////////////////////////
/// @brief DrawIndexed
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
SWR_FUNC(void,
SwrDrawIndexed,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numIndices,
uint32_t indexOffset,
int32_t baseVertex);
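//////////////////////////////////////////////////////////////////////////
/// Illustrative call (hypothetical values): draw 36 indices as a triangle
/// list from the currently bound index and vertex buffers, starting at the
/// beginning of the index buffer with no base vertex bias:
///
///     SwrDrawIndexed(hContext, TOP_TRIANGLE_LIST, 36, 0, 0);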
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDrawIndexedInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param numInstances - Number of instances to render.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer
/// (instanced data)
SWR_FUNC(void,
SwrDrawIndexedInstanced,
HANDLE hContext,
PRIMITIVE_TOPOLOGY topology,
uint32_t numIndices,
uint32_t numInstances,
uint32_t indexOffset,
int32_t baseVertex,
uint32_t startInstance);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrInvalidateTiles
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
/// invalidate.
/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
/// be hottile size-aligned.
SWR_FUNC(void,
SwrInvalidateTiles,
HANDLE hContext,
uint32_t attachmentMask,
const SWR_RECT& invalidateRect);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDiscardRect
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be
/// discarded.
SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch
/// @param hContext - Handle passed back from SwrCreateContext
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
SWR_FUNC(void,
SwrDispatch,
HANDLE hContext,
uint32_t threadGroupCountX,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ);
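//////////////////////////////////////////////////////////////////////////
/// Illustrative pairing (hypothetical values): a compute shader bound for an
/// 8x8x1 thread group (64 threads) and dispatched over a 16x16x1 grid:
///
///     SwrSetCsFunc(hContext, pfnCs, 64 /* 8*8*1 */, spillFillSize,
///                  scratchPerInstance, numInstances);
///     SwrDispatch(hContext, 16, 16, 1);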
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
enum SWR_TILE_STATE
{
SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface contents
// before rendering
SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents
SWR_TILE_RESOLVED = 3, // tile is in sync with the surface it represents
};
/// @todo Add a good description for what attachments are and when and why you would use the
/// different SWR_TILE_STATEs.
SWR_FUNC(void,
SwrStoreTiles,
HANDLE hContext,
uint32_t attachmentMask,
SWR_TILE_STATE postStoreTileState,
const SWR_RECT& storeRect);
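//////////////////////////////////////////////////////////////////////////
/// Illustrative end-of-frame store (hypothetical extents): flush color
/// attachment 0 back to its surface and leave the hottile marked as in sync
/// with it. The (1 << attachment) mask convention mirrors how the backend
/// walks attachmentMask with _BitScanForward.
///
///     SWR_RECT fullSurface = {0, 0, (int32_t)width, (int32_t)height};
///     SwrStoreTiles(hContext,
///                   1 << SWR_ATTACHMENT_COLOR0,
///                   SWR_TILE_RESOLVED,
///                   fullSurface);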
//////////////////////////////////////////////////////////////////////////
/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
/// @param hContext - Handle passed back from SwrCreateContext
/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
/// @param renderTargetArrayIndex - the RT array index to clear
/// @param clearColor - color used for clearing render targets
/// @param z - depth value used for clearing depth buffer
/// @param stencil - stencil value used for clearing stencil buffer
/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
SWR_FUNC(void,
SwrClearRenderTarget,
HANDLE hContext,
uint32_t attachmentMask,
uint32_t renderTargetArrayIndex,
const float clearColor[4],
float z,
uint8_t stencil,
const SWR_RECT& clearRect);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetRastState
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetViewports
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numViewports - number of viewports passed in
/// @param pViewports - Specifies extents of viewport.
/// @param pMatrices - If not specified then SWR computes a default one.
SWR_FUNC(void,
SwrSetViewports,
HANDLE hContext,
uint32_t numViewports,
const SWR_VIEWPORT* pViewports,
const SWR_VIEWPORT_MATRICES* pMatrices);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrSetScissorRects
/// @param hContext - Handle passed back from SwrCreateContext
/// @param numScissors - number of scissors passed in
/// @param pScissors - array of scissors
SWR_FUNC(
void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors);
//////////////////////////////////////////////////////////////////////////
/// @brief Returns a pointer to the private context state for the current
/// draw operation. This is used for external components such as the
/// sampler.
///
/// @note Client needs to resend private state prior to each draw call.
/// Also, SWR is responsible for the private state memory.
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Clients can use this to allocate memory for draw/dispatch
/// operations. The memory will automatically be freed once operation
/// has completed. Client can use this to allocate binding tables,
/// etc. needed for shader execution.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param size - Size of allocation
/// @param align - Alignment needed for allocation.
SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align);
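//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (hypothetical sizes): a transient binding table whose
/// memory is reclaimed automatically once the draw retires:
///
///     uint32_t* pBindingTable = (uint32_t*)SwrAllocDrawContextMemory(
///         hContext, 16 * sizeof(uint32_t), sizeof(uint32_t));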
//////////////////////////////////////////////////////////////////////////
/// @brief Enables stats counting
/// @param hContext - Handle passed back from SwrCreateContext
/// @param enable - If true then counts are incremented.
SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable);
//////////////////////////////////////////////////////////////////////////
/// @brief Enables stats counting
/// @param hContext - Handle passed back from SwrCreateContext
/// @param enable - If true then counts are incremented.
SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable);
//////////////////////////////////////////////////////////////////////////
/// @brief Mark end of frame - used for performance profiling
/// @param hContext - Handle passed back from SwrCreateContext
SWR_FUNC(void, SwrEndFrame, HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
/// @brief Initialize swr backend and memory internal tables
SWR_FUNC(void, SwrInit);
struct SWR_INTERFACE
{
PFNSwrCreateContext pfnSwrCreateContext;
PFNSwrDestroyContext pfnSwrDestroyContext;
PFNSwrBindApiThread pfnSwrBindApiThread;
PFNSwrSaveState pfnSwrSaveState;
PFNSwrRestoreState pfnSwrRestoreState;
PFNSwrSync pfnSwrSync;
PFNSwrStallBE pfnSwrStallBE;
PFNSwrWaitForIdle pfnSwrWaitForIdle;
PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;
PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer;
PFNSwrSetFetchFunc pfnSwrSetFetchFunc;
PFNSwrSetSoFunc pfnSwrSetSoFunc;
PFNSwrSetSoState pfnSwrSetSoState;
PFNSwrSetSoBuffers pfnSwrSetSoBuffers;
PFNSwrSetVertexFunc pfnSwrSetVertexFunc;
PFNSwrSetFrontendState pfnSwrSetFrontendState;
PFNSwrSetGsState pfnSwrSetGsState;
PFNSwrSetGsFunc pfnSwrSetGsFunc;
PFNSwrSetCsFunc pfnSwrSetCsFunc;
PFNSwrSetTsState pfnSwrSetTsState;
PFNSwrSetHsFunc pfnSwrSetHsFunc;
PFNSwrSetDsFunc pfnSwrSetDsFunc;
PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState;
PFNSwrSetBackendState pfnSwrSetBackendState;
PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState;
PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState;
PFNSwrSetBlendState pfnSwrSetBlendState;
PFNSwrSetBlendFunc pfnSwrSetBlendFunc;
PFNSwrDraw pfnSwrDraw;
PFNSwrDrawInstanced pfnSwrDrawInstanced;
PFNSwrDrawIndexed pfnSwrDrawIndexed;
PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced;
PFNSwrInvalidateTiles pfnSwrInvalidateTiles;
PFNSwrDiscardRect pfnSwrDiscardRect;
PFNSwrDispatch pfnSwrDispatch;
PFNSwrStoreTiles pfnSwrStoreTiles;
PFNSwrClearRenderTarget pfnSwrClearRenderTarget;
PFNSwrSetRastState pfnSwrSetRastState;
PFNSwrSetViewports pfnSwrSetViewports;
PFNSwrSetScissorRects pfnSwrSetScissorRects;
PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState;
PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory;
PFNSwrEnableStatsFE pfnSwrEnableStatsFE;
PFNSwrEnableStatsBE pfnSwrEnableStatsBE;
PFNSwrEndFrame pfnSwrEndFrame;
PFNSwrInit pfnSwrInit;
};
extern "C" {
typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs);
SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs);
}
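//////////////////////////////////////////////////////////////////////////
/// Illustrative loader sketch (hypothetical library name; the host selects
/// an architecture-specific backend at runtime and resolves SwrGetInterface
/// dynamically):
///
///     void* hLib = dlopen("libswrAVX2.so", RTLD_LOCAL | RTLD_LAZY);
///     PFNSwrGetInterface pfnGetInterface =
///         (PFNSwrGetInterface)dlsym(hLib, "SwrGetInterface");
///     SWR_INTERFACE swr;
///     pfnGetInterface(swr);
///     swr.pfnSwrInit();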
#endif

View file

@ -1,490 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file arena.h
*
* @brief Arena memory manager
*        The arena is convenient and fast for managing allocations that are
*        associated with an operation and can all be released together once
*        that operation has completed. Allocations are cheap since most of
*        the time it's simply an increment of an offset, and there is no
*        need to free individual allocations: all of the arena memory can be
*        freed at once.
*
******************************************************************************/
#pragma once
#include <mutex>
#include <algorithm>
#include <atomic>
#include "core/utils.h"
static const size_t ARENA_BLOCK_ALIGN = 64;
struct ArenaBlock
{
size_t blockSize = 0;
ArenaBlock* pNext = nullptr;
};
static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
class DefaultAllocator
{
public:
ArenaBlock* AllocateAligned(size_t size, size_t align)
{
SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
ArenaBlock* p = new (AlignedMalloc(size, align)) ArenaBlock();
p->blockSize = size;
return p;
}
void Free(ArenaBlock* pMem)
{
if (pMem)
{
SWR_ASSUME_ASSERT(pMem->blockSize < size_t(0xdddddddd));
AlignedFree(pMem);
}
}
};
// Caching Allocator for Arena
template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
struct CachingAllocatorT : DefaultAllocator
{
ArenaBlock* AllocateAligned(size_t size, size_t align)
{
SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
SWR_ASSUME_ASSERT(size <= uint32_t(-1));
uint32_t bucket = GetBucketId(size);
{
// search cached blocks
std::lock_guard<std::mutex> l(m_mutex);
ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align);
if (pBlock)
{
m_cachedSize -= pBlock->blockSize;
if (pBlock == m_pLastCachedBlocks[bucket])
{
m_pLastCachedBlocks[bucket] = pPrevBlock;
}
}
else
{
pPrevBlock = &m_oldCachedBlocks[bucket];
pBlock = SearchBlocks(pPrevBlock, size, align);
if (pBlock)
{
m_oldCachedSize -= pBlock->blockSize;
if (pBlock == m_pOldLastCachedBlocks[bucket])
{
m_pOldLastCachedBlocks[bucket] = pPrevBlock;
}
}
}
if (pBlock)
{
assert(pPrevBlock && pPrevBlock->pNext == pBlock);
pPrevBlock->pNext = pBlock->pNext;
pBlock->pNext = nullptr;
return pBlock;
}
m_totalAllocated += size;
#if 0
{
static uint32_t count = 0;
char buf[128];
sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
OutputDebugStringA(buf);
}
#endif
}
if (bucket && bucket < (CACHE_NUM_BUCKETS - 1))
{
// Make all blocks in this bucket the same size
size = size_t(1) << (bucket + 1 + CACHE_START_BUCKET_BIT);
}
return this->DefaultAllocator::AllocateAligned(size, align);
}
void Free(ArenaBlock* pMem)
{
if (pMem)
{
std::unique_lock<std::mutex> l(m_mutex);
InsertCachedBlock(GetBucketId(pMem->blockSize), pMem);
}
}
void FreeOldBlocks()
{
if (!m_cachedSize)
{
return;
}
std::lock_guard<std::mutex> l(m_mutex);
bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
{
if (doFree)
{
ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
m_oldCachedSize -= pBlock->blockSize;
m_totalAllocated -= pBlock->blockSize;
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
m_oldCachedBlocks[i].pNext = nullptr;
m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
}
if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
{
if (i && i < (CACHE_NUM_BUCKETS - 1))
{
// We know that all blocks are the same size.
// Just move the list over.
m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext;
m_cachedBlocks[i].pNext = nullptr;
if (m_pOldLastCachedBlocks[i]->pNext)
{
m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
}
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
}
else
{
// The end buckets can have variable sized lists.
// Insert each block based on size
ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
pBlock->pNext = nullptr;
m_cachedSize -= pBlock->blockSize;
InsertCachedBlock<true>(i, pBlock);
pBlock = pNext;
}
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
m_cachedBlocks[i].pNext = nullptr;
}
}
}
m_oldCachedSize += m_cachedSize;
m_cachedSize = 0;
}
CachingAllocatorT()
{
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
{
m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
}
}
~CachingAllocatorT()
{
// Free all cached blocks
for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
{
ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
pBlock = m_oldCachedBlocks[i].pNext;
while (pBlock)
{
ArenaBlock* pNext = pBlock->pNext;
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
}
}
private:
static uint32_t GetBucketId(size_t blockSize)
{
uint32_t bucketId = 0;
#if defined(BitScanReverseSizeT)
BitScanReverseSizeT((unsigned long*)&bucketId, (blockSize - 1) >> CACHE_START_BUCKET_BIT);
bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
#endif
return bucketId;
}
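    // Worked example (illustrative): with the default StartBucketBitT = 12,
    // (blockSize - 1) >> 12 maps requests up to 8 KiB to bucket 0, up to
    // 16 KiB to bucket 1, and so on, with everything past the last bucket
    // clamped to CACHE_NUM_BUCKETS - 1. This matches the rounding in
    // AllocateAligned, which sizes blocks in the middle buckets at
    // 1 << (bucket + 1 + CACHE_START_BUCKET_BIT).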
template <bool OldBlockT = false>
void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock)
{
SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS);
ArenaBlock* pPrevBlock =
OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId];
ArenaBlock* pBlock = pPrevBlock->pNext;
while (pBlock)
{
if (pNewBlock->blockSize >= pBlock->blockSize)
{
// Insert here
break;
}
pPrevBlock = pBlock;
pBlock = pBlock->pNext;
}
// Insert into list
SWR_ASSUME_ASSERT(pPrevBlock);
pPrevBlock->pNext = pNewBlock;
pNewBlock->pNext = pBlock;
if (OldBlockT)
{
if (m_pOldLastCachedBlocks[bucketId] == pPrevBlock)
{
m_pOldLastCachedBlocks[bucketId] = pNewBlock;
}
m_oldCachedSize += pNewBlock->blockSize;
}
else
{
if (m_pLastCachedBlocks[bucketId] == pPrevBlock)
{
m_pLastCachedBlocks[bucketId] = pNewBlock;
}
m_cachedSize += pNewBlock->blockSize;
}
}
static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
{
ArenaBlock* pBlock = pPrevBlock->pNext;
ArenaBlock* pPotentialBlock = nullptr;
ArenaBlock* pPotentialPrev = nullptr;
while (pBlock)
{
if (pBlock->blockSize >= blockSize)
{
if (pBlock == AlignUp(pBlock, align))
{
if (pBlock->blockSize == blockSize)
{
// Won't find a better match
break;
}
// We could use this as it is larger than we wanted, but
// continue to search for a better match
pPotentialBlock = pBlock;
pPotentialPrev = pPrevBlock;
}
}
else
{
// Blocks are sorted by size (biggest first)
// So, if we get here, there are no blocks
// large enough, fall through to allocation.
pBlock = nullptr;
break;
}
pPrevBlock = pBlock;
pBlock = pBlock->pNext;
}
if (!pBlock)
{
// Couldn't find an exact match, use next biggest size
pBlock = pPotentialBlock;
pPrevBlock = pPotentialPrev;
}
return pBlock;
}
// buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
static const size_t MAX_UNUSED_SIZE = sizeof(MEGABYTE);
ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS];
ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
std::mutex m_mutex;
size_t m_totalAllocated = 0;
size_t m_cachedSize = 0;
size_t m_oldCachedSize = 0;
};
typedef CachingAllocatorT<> CachingAllocator;
template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
class TArena
{
public:
TArena(T& in_allocator) : m_allocator(in_allocator) {}
TArena() : m_allocator(m_defAllocator) {}
~TArena() { Reset(true); }
void* AllocAligned(size_t size, size_t align)
{
if (0 == size)
{
return nullptr;
}
SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
if (m_pCurBlock)
{
ArenaBlock* pCurBlock = m_pCurBlock;
size_t offset = AlignUp(m_offset, align);
if ((offset + size) <= pCurBlock->blockSize)
{
void* pMem = PtrAdd(pCurBlock, offset);
m_offset = offset + size;
return pMem;
}
// Not enough memory in this block, fall through to allocate
// a new block
}
static const size_t ArenaBlockSize = BlockSizeT;
size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
// Add in one BLOCK_ALIGN unit to store ArenaBlock in.
blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
ArenaBlock* pNewBlock = m_allocator.AllocateAligned(
blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned.
SWR_ASSERT(pNewBlock != nullptr);
if (pNewBlock != nullptr)
{
m_offset = ARENA_BLOCK_ALIGN;
pNewBlock->pNext = m_pCurBlock;
m_pCurBlock = pNewBlock;
}
return AllocAligned(size, align);
}
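   // Worked example (illustrative): a fresh block reserves the first
   // ARENA_BLOCK_ALIGN (64) bytes for the ArenaBlock header, so the first
   // allocation lands at offset 64. Allocating 100 bytes at align 16 then
   // advances m_offset to 164, and a following request at align 32 is
   // placed at AlignUp(164, 32) = 192.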
void* Alloc(size_t size) { return AllocAligned(size, 1); }
void* AllocAlignedSync(size_t size, size_t align)
{
void* pAlloc = nullptr;
m_mutex.lock();
pAlloc = AllocAligned(size, align);
m_mutex.unlock();
return pAlloc;
}
void* AllocSync(size_t size)
{
void* pAlloc = nullptr;
m_mutex.lock();
pAlloc = Alloc(size);
m_mutex.unlock();
return pAlloc;
}
void Reset(bool removeAll = false)
{
m_offset = ARENA_BLOCK_ALIGN;
if (m_pCurBlock)
{
ArenaBlock* pUsedBlocks = m_pCurBlock->pNext;
m_pCurBlock->pNext = nullptr;
while (pUsedBlocks)
{
ArenaBlock* pBlock = pUsedBlocks;
pUsedBlocks = pBlock->pNext;
m_allocator.Free(pBlock);
}
if (removeAll)
{
m_allocator.Free(m_pCurBlock);
m_pCurBlock = nullptr;
}
}
}
bool IsEmpty()
{
return (m_pCurBlock == nullptr) ||
(m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
}
private:
ArenaBlock* m_pCurBlock = nullptr;
size_t m_offset = ARENA_BLOCK_ALIGN;
/// @note Mutex is only used by sync allocation functions.
std::mutex m_mutex;
DefaultAllocator m_defAllocator;
T& m_allocator;
};
using StdArena = TArena<DefaultAllocator>;
using CachingArena = TArena<CachingAllocator>;
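// Illustrative usage sketch (not part of the original file): allocations are
// bump-pointer cheap and are all released at once when the owning operation
// retires.
//
//     StdArena arena;
//     void*    pTable = arena.AllocAligned(256, 16);
//     void*    pTemp  = arena.Alloc(64);
//     arena.Reset();   // releases every allocation above in one shot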

View file

@ -1,420 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include "backends/gen_BackendPixelRate.hpp"
#include <algorithm>
//////////////////////////////////////////////////////////////////////////
/// @brief Process compute work.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
void ProcessComputeBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t threadGroupId,
void*& pSpillFillBuffer,
void*& pScratchSpace)
{
SWR_CONTEXT* pContext = pDC->pContext;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId);
const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
SWR_ASSERT(pTaskData != nullptr);
// Ensure spill fill memory has been allocated.
size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
if (spillFillSize && pSpillFillBuffer == nullptr)
{
pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES);
}
size_t scratchSpaceSize =
pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps;
if (scratchSpaceSize && pScratchSpace == nullptr)
{
pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES);
}
const API_STATE& state = GetApiState(pDC);
SWR_CS_CONTEXT csContext{0};
csContext.tileCounter = threadGroupId;
csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
csContext.pTGSM = pContext->ppScratch[workerId];
csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
csContext.pScratchSpace = (uint8_t*)pScratchSpace;
csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp;
state.pfnCsFunc(GetPrivateState(pDC),
pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
&csContext);
UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
AR_EVENT(CSStats((HANDLE)&csContext.stats));
RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Process shutdown.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - Macro tile id (unused for shutdown).
/// @param pUserData - Pointer to user data (unused for shutdown).
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
// Dummy function
}
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
uint32_t x, y;
MacroTileMgr::getTileIndices(macroTile, x, y);
SWR_ASSERT(x == 0 && y == 0);
}
void ProcessStoreTileBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
STORE_TILES_DESC* pDesc,
SWR_RENDERTARGET_ATTACHMENT attachment)
{
SWR_CONTEXT* pContext = pDC->pContext;
HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);
SWR_FORMAT srcFormat;
switch (attachment)
{
case SWR_ATTACHMENT_COLOR0:
case SWR_ATTACHMENT_COLOR1:
case SWR_ATTACHMENT_COLOR2:
case SWR_ATTACHMENT_COLOR3:
case SWR_ATTACHMENT_COLOR4:
case SWR_ATTACHMENT_COLOR5:
case SWR_ATTACHMENT_COLOR6:
case SWR_ATTACHMENT_COLOR7:
srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
break;
case SWR_ATTACHMENT_DEPTH:
srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
break;
case SWR_ATTACHMENT_STENCIL:
srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
break;
default:
SWR_INVALID("Unknown attachment: %d", attachment);
srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
break;
}
uint32_t x, y;
MacroTileMgr::getTileIndices(macroTile, x, y);
// Only need to store the hottile if it's been rendered to...
HOTTILE* pHotTile =
pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
if (pHotTile)
{
// clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
if (pHotTile->state == HOTTILE_CLEAR)
{
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC,
hWorkerPrivateData,
attachment,
macroTile,
pHotTile->renderTargetArrayIndex,
pHotTile->clearData,
pDesc->rect);
}
if (pHotTile->state == HOTTILE_DIRTY ||
pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
{
int32_t destX = KNOB_MACROTILE_X_DIM * x;
int32_t destY = KNOB_MACROTILE_Y_DIM * y;
pContext->pfnStoreTile(pDC,
hWorkerPrivateData,
srcFormat,
attachment,
destX,
destY,
pHotTile->renderTargetArrayIndex,
pHotTile->pBuffer);
}
if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
{
if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
pHotTile->state == HOTTILE_RESOLVED))
{
pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
}
}
}
RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
}
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;
unsigned long rt = 0;
uint32_t mask = pDesc->attachmentMask;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
}
}
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
void* pData)
{
DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pData;
SWR_CONTEXT* pContext = pDC->pContext;
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
{
if (pDesc->attachmentMask & (1 << i))
{
HOTTILE* pHotTile =
pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
pDC,
macroTile,
(SWR_RENDERTARGET_ATTACHMENT)i,
pDesc->createNewTiles,
numSamples);
if (pHotTile)
{
HOTTILE_STATE newState = (HOTTILE_STATE)pDesc->newTileState;
if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_CLEAR)
{
if (newState == HOTTILE_INVALID)
{
// This is OK for APIs that explicitly allow discards
// (for e.g. depth / stencil data)
//SWR_INVALID("Discarding valid data!");
}
}
pHotTile->state = newState;
}
}
}
}
template <uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t x,
uint32_t y,
SWR_TRIANGLE_DESC& work,
RenderOutputBuffers& renderBuffers)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
///@todo: handle center multisample pattern
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
const API_STATE& state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
SWR_PS_CONTEXT psContext;
// No pixel shader is bound, so there is no SetupPixelShaderContext call here.
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
// iterate over active samples
unsigned long sample = 0;
uint32_t sampleMask = state.blendState.sampleMask;
while (_BitScanForward(&sample, sampleMask))
{
sampleMask &= ~(1 << sample);
simdmask coverageMask = work.coverageMask[sample] & MASK;
if (coverageMask)
{
// offset depth/stencil buffers current sample
uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
"Unsupported depth hot tile format");
const simdscalar z =
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa,
coeffs.vZb,
coeffs.vZc,
psContext.vI.sample,
psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
// interpolate user clip distance if available
if (state.backendState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
work.pUserClipBuffer,
psContext.vI.sample,
psContext.vJ.sample);
}
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
simdscalar stencilPassMask = vCoverageMask;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
simdscalar depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthSample,
vCoverageMask,
pStencilSample,
&stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
}
Endtile:
ATTR_UNUSED;
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
}
vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
}
RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
}
PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2] // canEarlyZ
= {};
PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2] // forcedSampleCount
[2] // canEarlyZ
= {};
PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
[2] // centroid
[2] // canEarlyZ
= {};
void InitBackendFuncTables()
{
InitBackendPixelRate();
InitBackendSingleFuncTable(gBackendSingleSample);
InitBackendSampleFuncTable(gBackendSampleRateTable);
gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>;
gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>;
gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>;
gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>;
gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
}

View file

@ -1,70 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.h
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "core/context.h"
#include "core/multisample.h"
#include "depthstencil.h"
#include "rdtsc_core.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t threadGroupId,
void*& pSpillFillBuffer,
void*& pScratchSpace);
void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
void* pData);
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*,
HANDLE hWorkerData,
SWR_RENDERTARGET_ATTACHMENT rt,
uint32_t,
uint32_t,
uint32_t[4],
const SWR_RECT& rect);
extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2]; // canEarlyZ
extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2] // forcedSampleCount
[2] // canEarlyZ
;
extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
[SWR_INPUT_COVERAGE_COUNT][2] // centroid
[2]; // canEarlyZ

View file

@ -1,308 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template <SWR_FORMAT format>
void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
{
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter =
(KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
template <SWR_FORMAT format>
INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
HANDLE hWorkerPrivateData,
SWR_RENDERTARGET_ATTACHMENT rt,
uint32_t macroTile,
uint32_t renderTargetArrayIndex,
uint32_t clear[4],
const SWR_RECT& rect)
{
// convert clear color to hottile format
// clear color is in RGBA float/uint32
simd16vector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
uint32_t tileX, tileY;
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
// Init to full macrotile
SWR_RECT clearTile = {
KNOB_MACROTILE_X_DIM * int32_t(tileX),
KNOB_MACROTILE_Y_DIM * int32_t(tileY),
KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
};
// intersect with clear rect
clearTile &= rect;
// translate to local hottile origin
clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM,
-int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
// Make maximums inclusive (needed for convert to raster tiles)
clearTile.xmax -= 1;
clearTile.ymax -= 1;
// convert to raster tiles
clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
// compute steps between raster tile samples / raster tiles / macro tile rows
const uint32_t rasterTileSampleStep =
KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
const uint32_t rasterTileStep =
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext,
pDC,
hWorkerPrivateData,
macroTile,
rt,
true,
numSamples,
renderTargetArrayIndex);
uint32_t rasterTileStartOffset =
(ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp>>(
pitch, clearTile.xmin, clearTile.ymin)) *
numSamples;
uint8_t* pRasterTileRow =
pHotTile->pBuffer +
rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ,
// FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
// loop over all raster tiles in the current hot tile
for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
{
uint8_t* pRasterTile = pRasterTileRow;
for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
{
for (int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
{
ClearRasterTile<format>(pRasterTile, vClear);
pRasterTile += rasterTileSampleStep;
}
}
pRasterTileRow += macroTileRowStep;
}
pHotTile->state = HOTTILE_DIRTY;
}
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
SWR_CONTEXT* pContext = pDC->pContext;
HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
if (KNOB_FAST_CLEAR)
{
CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
uint32_t numSamples = GetNumSamples(sampleCount);
SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
HOTTILE* pHotTile =
pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroTile,
(SWR_RENDERTARGET_ATTACHMENT)rt,
true,
numSamples,
pClear->renderTargetArrayIndex);
// All we want to do here is to mark the hot tile as being in a "needs clear" state.
pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
pHotTile->state = HOTTILE_CLEAR;
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroTile,
SWR_ATTACHMENT_DEPTH,
true,
numSamples,
pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
pHotTile->state = HOTTILE_CLEAR;
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
pDC,
hWorkerPrivateData,
macroTile,
SWR_ATTACHMENT_STENCIL,
true,
numSamples,
pClear->renderTargetArrayIndex);
pHotTile->clearData[0] = pClear->clearStencil;
pHotTile->state = HOTTILE_CLEAR;
}
RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
}
else
{
// Legacy clear
CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEClear, pDC->drawId);
if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
{
uint32_t clearData[4];
clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
unsigned long rt = 0;
uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
while (_BitScanForward(&rt, mask))
{
mask &= ~(1 << rt);
pfnClearTiles(pDC,
hWorkerPrivateData,
(SWR_RENDERTARGET_ATTACHMENT)rt,
macroTile,
pClear->renderTargetArrayIndex,
clearData,
pClear->rect);
}
}
if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
{
uint32_t clearData[4];
clearData[0] = *(uint32_t*)&pClear->clearDepth;
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
SWR_ASSERT(pfnClearTiles != nullptr);
pfnClearTiles(pDC,
hWorkerPrivateData,
SWR_ATTACHMENT_DEPTH,
macroTile,
pClear->renderTargetArrayIndex,
clearData,
pClear->rect);
}
if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
{
uint32_t clearData[4];
clearData[0] = pClear->clearStencil;
PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
pfnClearTiles(pDC,
hWorkerPrivateData,
SWR_ATTACHMENT_STENCIL,
macroTile,
pClear->renderTargetArrayIndex,
clearData,
pClear->rect);
}
RDTSC_END(pDC->pContext->pBucketMgr, BEClear, 1);
}
}
void InitClearTilesTable()
{
memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
gClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
gClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
gClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
gClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
}

File diff suppressed because it is too large

View file

@ -1,454 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template <typename T>
void BackendSampleRate(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t x,
uint32_t y,
SWR_TRIANGLE_DESC& work,
RenderOutputBuffers& renderBuffers)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId);
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
const API_STATE& state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(psContext.pColorBuffer,
&pDepthBuffer,
&pStencilBuffer,
state.colorHottileEnable,
renderBuffers);
bool isTileDirty = false;
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask =
(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
? &work.innerCoverageMask
: &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(
pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, false>(
&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
{
simdmask coverageMask = work.coverageMask[sample] & MASK;
if (coverageMask)
{
// offset depth/stencil buffers current sample
uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
"Unsupported depth hot tile format");
const simdscalar z =
_simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
// calculate per sample positions
psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
CalcSampleBarycentrics(coeffs, psContext);
// interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa,
coeffs.vZb,
coeffs.vZc,
psContext.vI.sample,
psContext.vJ.sample);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
// interpolate user clip distance if available
if (state.backendState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
work.pUserClipBuffer,
psContext.vI.sample,
psContext.vJ.sample);
}
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthSample,
vCoverageMask,
pStencilSample,
&stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
// write depth/stencil now if earlyZ is forced on or no samples passed; skip shading when nothing passed depth
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
}
psContext.sampleIndex = sample;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
// update stats
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
AR_EVENT(PSStats((HANDLE)&psContext.stats));
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
if (_simd_movemask_ps(vCoverageMask))
{
isTileDirty = true;
}
// late-Z
if (!T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthSample,
vCoverageMask,
pStencilSample,
&stencilPassMask);
AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// depth/stencil write still needs to run so stencil writes take effect
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
continue;
}
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
sample,
&state.blendState,
state.pfnBlendFunc,
vCoverageMask,
depthPassMask,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthSample,
depthPassMask,
vCoverageMask,
pStencilSample,
stencilPassMask);
}
RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
}
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
Endtile:
ATTR_UNUSED;
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
if (useAlternateOffset)
{
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
if (isTileDirty)
{
SetRenderHotTilesDirty(pDC, renderBuffers);
}
RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0);
}
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooserSampleRate
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch (tArg)
{
case SWR_BACKEND_MSAA_SAMPLE_RATE:
return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
break;
case SWR_BACKEND_SINGLE_SAMPLE:
case SWR_BACKEND_MSAA_PIXEL_RATE:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_INPUT_COVERAGE_NONE:
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_NORMAL:
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_MULTISAMPLE_1X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_2X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_4X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_8X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_16X:
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if (tArg == true)
{
return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSampleFuncTable(
PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
{
for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT;
sampleCount++)
{
for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for (uint32_t centroid = 0; centroid < 2; centroid++)
{
for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[sampleCount][inputCoverage][centroid][canEarlyZ] =
BEChooserSampleRate<>::GetFunc(
(SWR_MULTISAMPLE_COUNT)sampleCount,
false,
(SWR_INPUT_COVERAGE)inputCoverage,
(centroid > 0),
false,
(canEarlyZ > 0),
(SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
}
}
}
}
}
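// A minimal standalone sketch of the chooser pattern above. The
// Coverage/Traits/Chooser names are hypothetical stand-ins for the SWR types
// (not part of the original source); each GetFunc overload peels one runtime
// argument off the front and recurses with it appended as a compile-time
// template argument until only the statically specialized function remains.
#include <cstdio>

enum Coverage : unsigned { COV_NONE = 0, COV_NORMAL = 1 };

using PFN = void (*)();

// Stand-in for SwrBackendTraits: a distinct Func per compile-time arg pack.
template <unsigned... Args>
struct Traits
{
    static void Func() { std::puts("statically specialized backend"); }
};

template <unsigned... Args>
struct Chooser
{
    // Last-arg terminator: every runtime value is now in Args...
    static PFN GetFunc() { return &Traits<Args...>::Func; }

    // Convert one runtime enum to a template argument and recurse.
    template <typename... Rest>
    static PFN GetFunc(Coverage c, Rest... rest)
    {
        switch (c)
        {
        case COV_NORMAL:
            return Chooser<Args..., COV_NORMAL>::GetFunc(rest...);
        default:
            return Chooser<Args..., COV_NONE>::GetFunc(rest...);
        }
    }
};

int main()
{
    PFN pfn = Chooser<>::GetFunc(COV_NORMAL); // runtime value in...
    pfn(); // ...compile-time specialization out: Traits<COV_NORMAL>::Func
}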


@@ -1,428 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file backend.cpp
*
* @brief Backend handles rasterization, pixel shading and output merger
* operations.
*
******************************************************************************/
#include <smmintrin.h>
#include "backend.h"
#include "backend_impl.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
#include <algorithm>
template <typename T>
void BackendSingleSample(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t x,
uint32_t y,
SWR_TRIANGLE_DESC& work,
RenderOutputBuffers& renderBuffers)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId);
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
const API_STATE& state = GetApiState(pDC);
BarycentricCoeffs coeffs;
SetupBarycentricCoeffs(&coeffs, work);
SWR_PS_CONTEXT psContext;
const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
SetupPixelShaderContext<T>(&psContext, samplePos, work);
uint8_t *pDepthBuffer, *pStencilBuffer;
SetupRenderBuffers(psContext.pColorBuffer,
&pDepthBuffer,
&pStencilBuffer,
state.colorHottileEnable,
renderBuffers);
// Indicates backend rendered something to the color buffer
bool isTileDirty = false;
RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1);
psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
simdmask coverageMask = work.coverageMask[0] & MASK;
if (coverageMask)
{
if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
"Unsupported depth hot tile format");
const simdscalar z =
_simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
}
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask =
(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
? &work.innerCoverageMask
: &work.coverageMask[0];
generateInputCoverage<T, T::InputCoverage>(
pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
}
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
CalcPixelBarycentrics(coeffs, psContext);
CalcCentroid<T, true>(
&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
// interpolate and quantize z
psContext.vZ = vplaneps(
coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1);
// interpolate user clip distance if available
if (state.backendState.clipDistanceMask)
{
coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
work.pUserClipBuffer,
psContext.vI.center,
psContext.vJ.center);
}
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
simdscalar depthPassMask = vCoverageMask;
simdscalar stencilPassMask = vCoverageMask;
// Early-Z?
if (T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthBuffer,
vCoverageMask,
pStencilBuffer,
&stencilPassMask);
AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
// write depth/stencil now if earlyZ is forced on or no pixels passed; skip shading when nothing passed depth
if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthBuffer,
depthPassMask,
vCoverageMask,
pStencilBuffer,
stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
{
goto Endtile;
}
}
}
psContext.sampleIndex = 0;
psContext.activeMask = _simd_castps_si(vCoverageMask);
// execute pixel shader
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
// update stats
UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
AR_EVENT(PSStats((HANDLE)&psContext.stats));
vCoverageMask = _simd_castsi_ps(psContext.activeMask);
if (_simd_movemask_ps(vCoverageMask))
{
isTileDirty = true;
}
// late-Z
if (!T::bCanEarlyZ)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
depthPassMask = DepthStencilTest(&state,
work.triFlags.frontFacing,
work.triFlags.viewportIndex,
psContext.vZ,
pDepthBuffer,
vCoverageMask,
pStencilBuffer,
&stencilPassMask);
AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
_simd_movemask_ps(stencilPassMask),
_simd_movemask_ps(vCoverageMask)));
RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// depth/stencil write still needs to run so stencil writes take effect
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthBuffer,
depthPassMask,
vCoverageMask,
pStencilBuffer,
stencilPassMask);
goto Endtile;
}
}
else
{
// for early z, consolidate discards from shader
// into depthPassMask
depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
}
uint32_t statMask = _simd_movemask_ps(depthPassMask);
uint32_t statCount = _mm_popcnt_u32(statMask);
UPDATE_STAT_BE(DepthPassCount, statCount);
// output merger
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
0,
&state.blendState,
state.pfnBlendFunc,
vCoverageMask,
depthPassMask,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
{
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
&state.depthStencilState,
work.triFlags.frontFacing,
psContext.vZ,
pDepthBuffer,
depthPassMask,
vCoverageMask,
pStencilBuffer,
stencilPassMask);
}
RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
}
Endtile:
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
{
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
if (useAlternateOffset)
{
unsigned long rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
}
psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
}
if (isTileDirty)
{
SetRenderHotTilesDirty(pDC, renderBuffers);
}
RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0);
}
// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t... ArgsT>
struct BEChooserSingleSample
{
// Last Arg Terminator
static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
{
switch (tArg)
{
case SWR_BACKEND_SINGLE_SAMPLE:
return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
break;
case SWR_BACKEND_MSAA_PIXEL_RATE:
case SWR_BACKEND_MSAA_SAMPLE_RATE:
default:
SWR_ASSERT(0 && "Invalid backend func\n");
return nullptr;
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_INPUT_COVERAGE_NONE:
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_NORMAL:
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
remainingArgs...);
break;
case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample pattern\n");
return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
{
switch (tArg)
{
case SWR_MULTISAMPLE_1X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_2X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_4X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_8X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
break;
case SWR_MULTISAMPLE_16X:
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
break;
default:
SWR_ASSERT(0 && "Invalid sample count\n");
return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
break;
}
}
// Recursively parse args
template <typename... TArgsT>
static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
{
if (tArg == true)
{
return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
}
return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
}
};
void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
{
for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
{
for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
{
for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
{
table[inputCoverage][isCentroid][canEarlyZ] =
BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
false,
(SWR_INPUT_COVERAGE)inputCoverage,
(isCentroid > 0),
false,
(canEarlyZ > 0),
SWR_BACKEND_SINGLE_SAMPLE);
}
}
}
}
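// At draw time the table built above is indexed with current state to fetch
// a pre-specialized backend. A hedged sketch of that lookup; the helper name
// is hypothetical and the real SWR state plumbing differs:
static PFN_BACKEND_FUNC SelectSingleSampleBackend(
    PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2],
    SWR_INPUT_COVERAGE coverage,
    bool usesCentroid,
    bool canEarlyZ)
{
    return table[coverage][usesCentroid ? 1 : 0][canEarlyZ ? 1 : 0];
}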


@@ -1,57 +0,0 @@
# Copyright © 2017-2018 Intel Corporation
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
files_swr_common += custom_target(
'gen_backend_pixel',
input : swr_gen_backends_py,
output : [
'gen_BackendPixelRate0.cpp', 'gen_BackendPixelRate1.cpp',
'gen_BackendPixelRate2.cpp', 'gen_BackendPixelRate3.cpp',
'gen_BackendPixelRate.hpp',
],
command : [
prog_python, '@INPUT@',
'--outdir', '@OUTDIR@',
'--dim', '5', '2', '3', '2', '2', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : [ swr_gen_backend_files, swr_gen_header_init_files ],
)
files_swr_common += custom_target(
'gen_backend_raster',
input : swr_gen_backends_py,
output : [
'gen_rasterizer0.cpp', 'gen_rasterizer1.cpp',
'gen_rasterizer2.cpp', 'gen_rasterizer3.cpp',
'gen_rasterizer.hpp',
],
command : [
prog_python, '@INPUT@',
'--outdir', '@OUTDIR@',
'--rast',
'--dim', '5', '2', '2', '3', '5', '2',
'--numfiles', '4',
'--cpp', '--hpp',
],
depend_files : [ swr_gen_rasterizer_files, swr_gen_header_init_files ],
)

File diff suppressed because it is too large


@@ -1,254 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file binner.h
*
* @brief Declaration for the macrotile binner
*
******************************************************************************/
#include "state.h"
#include "conservativeRast.h"
#include "utils.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
/// raster state.
///
/// Can't use a variable template because we must stick to C++11 features;
/// variable templates were introduced in C++14.
template <typename SIMD_T>
struct SwrPixelOffsets
{
public:
INLINE static Float<SIMD_T> GetOffset(uint32_t loc)
{
SWR_ASSERT(loc <= 1);
return SIMD_T::set1_ps(loc ? 0.5f : 0.0f);
}
};
//////////////////////////////////////////////////////////////////////////
/// @brief Convert the X,Y coords of a triangle to the requested Fixed
/// Point precision from FP32.
template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>>
INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn)
{
return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value)));
}
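// A scalar worked example of the conversion above, assuming the default
// Fixed_16_8 traits (ScaleT = 256): 1.5f maps to 1.5 * 256 = 384 (0x180).
// cvtps_epi32 rounds to nearest; lrintf does the same in the default FP
// environment. Sketch only, not part of the original source.
#include <cassert>
#include <cmath>
#include <cstdint>

static int32_t fpToFixed_16_8(float v)
{
    return static_cast<int32_t>(std::lrintf(v * 256.0f));
}

int main()
{
    assert(fpToFixed_16_8(1.5f) == 384);   //  1.5  * 256
    assert(fpToFixed_16_8(-0.25f) == -64); // -0.25 * 256
}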
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to set the X,Y coords of a triangle to the
/// requested Fixed Point precision from FP32.
/// @param tri: simdvector[3] of FP triangle verts
/// @param vXi: fixed point X coords of tri verts
/// @param vYi: fixed point Y coords of tri verts
template <typename SIMD_T>
INLINE static void
FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3])
{
vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x);
vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y);
vXi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].x);
vYi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].y);
vXi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].x);
vYi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].y);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Calculate bounding box for current triangle
/// @tparam CT: ConservativeRastFETraits type
/// @param vX: fixed point X position for triangle verts
/// @param vY: fixed point Y position for triangle verts
/// @param bbox: fixed point bbox
/// *Note*: expects vX, vY to be in the correct precision for the type
/// of rasterization. This avoids unnecessary FP->fixed conversions.
template <typename SIMD_T, typename CT>
INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3],
const Integer<SIMD_T> (&vY)[3],
SIMDBBOX_T<SIMD_T>& bbox)
{
Integer<SIMD_T> vMinX = vX[0];
vMinX = SIMD_T::min_epi32(vMinX, vX[1]);
vMinX = SIMD_T::min_epi32(vMinX, vX[2]);
Integer<SIMD_T> vMaxX = vX[0];
vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]);
vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]);
Integer<SIMD_T> vMinY = vY[0];
vMinY = SIMD_T::min_epi32(vMinY, vY[1]);
vMinY = SIMD_T::min_epi32(vMinY, vY[2]);
Integer<SIMD_T> vMaxY = vY[0];
vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]);
vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]);
if (CT::BoundingBoxOffsetT::value != 0)
{
/// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for
/// conservative rasterization. Expand bbox by 1/256; coverage will be
/// correctly handled in the rasterizer.
const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
vMinX = SIMD_T::sub_epi32(vMinX, value);
vMaxX = SIMD_T::add_epi32(vMaxX, value);
vMinY = SIMD_T::sub_epi32(vMinY, value);
vMaxY = SIMD_T::add_epi32(vMaxY, value);
}
bbox.xmin = vMinX;
bbox.xmax = vMaxX;
bbox.ymin = vMinY;
bbox.ymax = vMaxY;
}
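// A scalar sketch of the expansion branch above (illustration only): when
// CT::BoundingBoxOffsetT::value is nonzero (1 for the conservative traits in
// conservativeRast.h), every bbox edge moves outward by that many fixed-point
// units, and the whole branch folds away for standard rasterization.
struct BBoxScalar { int32_t xmin, ymin, xmax, ymax; };

template <int32_t Offset>
BBoxScalar ExpandBBox(BBoxScalar b)
{
    if (Offset != 0) // compile-time constant: dead code when Offset == 0
    {
        b.xmin -= Offset; b.ymin -= Offset;
        b.xmax += Offset; b.ymax += Offset;
    }
    return b;
}
// ExpandBBox<1>({0, 0, 256, 256}) yields {-1, -1, 257, 257}.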
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
const uint32_t* pViewportIndex,
simdscalari& scisXmin,
simdscalari& scisYmin,
simdscalari& scisXmax,
simdscalari& scisYmax)
{
scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin,
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[0]].xmin);
scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin,
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
pScissorsInFixedPoint[pViewportIndex[0]].ymin);
scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax,
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
pScissorsInFixedPoint[pViewportIndex[0]].xmax);
scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax,
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
pScissorsInFixedPoint[pViewportIndex[0]].ymax);
}
static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
const uint32_t* pViewportIndex,
simd16scalari& scisXmin,
simd16scalari& scisYmin,
simd16scalari& scisXmax,
simd16scalari& scisYmax)
{
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin,
pScissorsInFixedPoint[pViewportIndex[14]].xmin,
pScissorsInFixedPoint[pViewportIndex[13]].xmin,
pScissorsInFixedPoint[pViewportIndex[12]].xmin,
pScissorsInFixedPoint[pViewportIndex[11]].xmin,
pScissorsInFixedPoint[pViewportIndex[10]].xmin,
pScissorsInFixedPoint[pViewportIndex[9]].xmin,
pScissorsInFixedPoint[pViewportIndex[8]].xmin,
pScissorsInFixedPoint[pViewportIndex[7]].xmin,
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[0]].xmin);
scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin,
pScissorsInFixedPoint[pViewportIndex[14]].ymin,
pScissorsInFixedPoint[pViewportIndex[13]].ymin,
pScissorsInFixedPoint[pViewportIndex[12]].ymin,
pScissorsInFixedPoint[pViewportIndex[11]].ymin,
pScissorsInFixedPoint[pViewportIndex[10]].ymin,
pScissorsInFixedPoint[pViewportIndex[9]].ymin,
pScissorsInFixedPoint[pViewportIndex[8]].ymin,
pScissorsInFixedPoint[pViewportIndex[7]].ymin,
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
pScissorsInFixedPoint[pViewportIndex[0]].ymin);
scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax,
pScissorsInFixedPoint[pViewportIndex[14]].xmax,
pScissorsInFixedPoint[pViewportIndex[13]].xmax,
pScissorsInFixedPoint[pViewportIndex[12]].xmax,
pScissorsInFixedPoint[pViewportIndex[11]].xmax,
pScissorsInFixedPoint[pViewportIndex[10]].xmax,
pScissorsInFixedPoint[pViewportIndex[9]].xmax,
pScissorsInFixedPoint[pViewportIndex[8]].xmax,
pScissorsInFixedPoint[pViewportIndex[7]].xmax,
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
pScissorsInFixedPoint[pViewportIndex[0]].xmax);
scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax,
pScissorsInFixedPoint[pViewportIndex[14]].ymax,
pScissorsInFixedPoint[pViewportIndex[13]].ymax,
pScissorsInFixedPoint[pViewportIndex[12]].ymax,
pScissorsInFixedPoint[pViewportIndex[11]].ymax,
pScissorsInFixedPoint[pViewportIndex[10]].ymax,
pScissorsInFixedPoint[pViewportIndex[9]].ymax,
pScissorsInFixedPoint[pViewportIndex[8]].ymax,
pScissorsInFixedPoint[pViewportIndex[7]].ymax,
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
pScissorsInFixedPoint[pViewportIndex[0]].ymax);
}
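// Both overloads above are hand-unrolled SIMD gathers; per lane the work is
// the same indexed load. A scalar sketch (xmin shown; the other three fields
// are identical; not part of the original source):
static void GatherScissorXminScalar(const SWR_RECT* pScissorsInFixedPoint,
                                    const uint32_t* pViewportIndex,
                                    int32_t* pScisXmin, // one element per lane
                                    uint32_t numLanes)
{
    for (uint32_t lane = 0; lane < numLanes; ++lane)
    {
        pScisXmin[lane] = pScissorsInFixedPoint[pViewportIndex[lane]].xmin;
    }
}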


@@ -1,348 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file blend.cpp
*
* @brief Implementation for blending operations.
*
******************************************************************************/
#include "state.h"
template <bool Color, bool Alpha>
INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func,
simdvector& constantColor,
simdvector& src,
simdvector& src1,
simdvector& dst,
simdvector& out)
{
simdvector result;
switch (func)
{
case BLENDFACTOR_ZERO:
result.x = _simd_setzero_ps();
result.y = _simd_setzero_ps();
result.z = _simd_setzero_ps();
result.w = _simd_setzero_ps();
break;
case BLENDFACTOR_ONE:
result.x = _simd_set1_ps(1.0);
result.y = _simd_set1_ps(1.0);
result.z = _simd_set1_ps(1.0);
result.w = _simd_set1_ps(1.0);
break;
case BLENDFACTOR_SRC_COLOR:
result = src;
break;
case BLENDFACTOR_DST_COLOR:
result = dst;
break;
case BLENDFACTOR_INV_SRC_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
break;
case BLENDFACTOR_INV_DST_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
break;
case BLENDFACTOR_SRC_ALPHA:
result.x = src.w;
result.y = src.w;
result.z = src.w;
result.w = src.w;
break;
case BLENDFACTOR_INV_SRC_ALPHA:
{
simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
result.x = oneMinusSrcA;
result.y = oneMinusSrcA;
result.z = oneMinusSrcA;
result.w = oneMinusSrcA;
break;
}
case BLENDFACTOR_DST_ALPHA:
result.x = dst.w;
result.y = dst.w;
result.z = dst.w;
result.w = dst.w;
break;
case BLENDFACTOR_INV_DST_ALPHA:
{
simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
result.x = oneMinusDstA;
result.y = oneMinusDstA;
result.z = oneMinusDstA;
result.w = oneMinusDstA;
break;
}
case BLENDFACTOR_SRC_ALPHA_SATURATE:
{
simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
result.x = sat;
result.y = sat;
result.z = sat;
result.w = _simd_set1_ps(1.0);
break;
}
case BLENDFACTOR_CONST_COLOR:
result.x = constantColor[0];
result.y = constantColor[1];
result.z = constantColor[2];
result.w = constantColor[3];
break;
case BLENDFACTOR_CONST_ALPHA:
result.x = result.y = result.z = result.w = constantColor[3];
break;
case BLENDFACTOR_INV_CONST_COLOR:
{
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]);
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]);
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]);
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
break;
}
case BLENDFACTOR_INV_CONST_ALPHA:
{
result.x = result.y = result.z = result.w =
_simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
break;
}
case BLENDFACTOR_SRC1_COLOR:
result.x = src1.x;
result.y = src1.y;
result.z = src1.z;
result.w = src1.w;
break;
case BLENDFACTOR_SRC1_ALPHA:
result.x = result.y = result.z = result.w = src1.w;
break;
case BLENDFACTOR_INV_SRC1_COLOR:
result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x);
result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y);
result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z);
result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
break;
case BLENDFACTOR_INV_SRC1_ALPHA:
result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
break;
default:
SWR_INVALID("Unimplemented blend factor: %d", func);
}
if (Color)
{
out.x = result.x;
out.y = result.y;
out.z = result.z;
}
if (Alpha)
{
out.w = result.w;
}
}
template <bool Color, bool Alpha>
INLINE void BlendFunc(SWR_BLEND_OP blendOp,
simdvector& src,
simdvector& srcFactor,
simdvector& dst,
simdvector& dstFactor,
simdvector& out)
{
simdvector result;
switch (blendOp)
{
case BLENDOP_ADD:
result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_SUBTRACT:
result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_REVSUBTRACT:
result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
break;
case BLENDOP_MIN:
result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
break;
case BLENDOP_MAX:
result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
break;
default:
SWR_INVALID("Unimplemented blend function: %d", blendOp);
}
if (Color)
{
out.x = result.x;
out.y = result.y;
out.z = result.z;
}
if (Alpha)
{
out.w = result.w;
}
}
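// The classic src-alpha "over" blend is one concrete instance of the two
// routines above: BLENDFACTOR_SRC_ALPHA and BLENDFACTOR_INV_SRC_ALPHA fed
// into BLENDOP_ADD give out = src * srcA + dst * (1 - srcA). A scalar
// per-channel sketch (illustration only):
static float BlendOverChannel(float src, float dst, float srcAlpha)
{
    float srcFactor = srcAlpha;               // BLENDFACTOR_SRC_ALPHA
    float dstFactor = 1.0f - srcAlpha;        // BLENDFACTOR_INV_SRC_ALPHA
    return srcFactor * src + dstFactor * dst; // BLENDOP_ADD
}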
template <SWR_TYPE type>
INLINE void Clamp(simdvector& src)
{
switch (type)
{
case SWR_TYPE_FLOAT:
break;
case SWR_TYPE_UNORM:
src.x = _simd_max_ps(src.x, _simd_setzero_ps());
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
src.y = _simd_max_ps(src.y, _simd_setzero_ps());
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
src.z = _simd_max_ps(src.z, _simd_setzero_ps());
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
src.w = _simd_max_ps(src.w, _simd_setzero_ps());
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
break;
case SWR_TYPE_SNORM:
src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f));
src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f));
src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f));
src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f));
src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
break;
default:
SWR_INVALID("Unimplemented clamp: %d", type);
break;
}
}
template <SWR_TYPE type>
void Blend(const SWR_BLEND_STATE* pBlendState,
const SWR_RENDER_TARGET_BLEND_STATE* pState,
simdvector& src,
simdvector& src1,
uint8_t* pDst,
simdvector& result)
{
// load render target
simdvector dst;
LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst);
simdvector constColor;
constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
// clamp src/dst/constant
Clamp<type>(src);
Clamp<type>(src1);
Clamp<type>(dst);
Clamp<type>(constColor);
simdvector srcFactor, dstFactor;
if (pBlendState->independentAlphaBlendEnable)
{
GenerateBlendFactor<true, false>(
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor,
constColor,
src,
src1,
dst,
srcFactor);
GenerateBlendFactor<true, false>(
(SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
GenerateBlendFactor<false, true>(
(SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
BlendFunc<true, false>(
(SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
BlendFunc<false, true>(
(SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
}
else
{
GenerateBlendFactor<true, true>(
(SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
GenerateBlendFactor<true, true>(
(SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
BlendFunc<true, true>(
(SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
}
}


@@ -1,336 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file clip.cpp
*
* @brief Implementation for clipping
*
******************************************************************************/
#include <assert.h>
#include "common/os.h"
#include "core/clip.h"
float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
}
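// Worked example for the near plane, where the boundary coordinate is simply
// z: an edge running from z0 = -1.0 (outside, since z < 0 is clipped) to
// z1 = 3.0 (inside) intersects a quarter of the way along the edge.
// Illustration only, using ComputeInterpFactor as defined above:
static void InterpFactorExample()
{
    // -1 / (-1 - 3) == 0.25, exactly representable in binary floating point.
    assert(ComputeInterpFactor(-1.0f, 3.0f) == 0.25f);
}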
template <SWR_CLIPCODES ClippingPlane>
inline void intersect(
int s, // index to first edge vertex v0 in pInPts.
int p, // index to second edge vertex v1 in pInPts.
const float* pInPts, // array of all the input positions.
const float* pInAttribs, // array of all attributes for all vertex. All the attributes for each
// vertex is contiguous.
int numInAttribs, // number of attributes per vertex.
int i, // output index.
float* pOutPts, // array of output positions. We'll write our new intersection point at i*4.
float* pOutAttribs) // array of output attributes. We'll write our new attributes at
// i*numInAttribs.
{
float t;
// Find the parameter of the intersection.
// t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
const float* v1 = &pInPts[s * 4];
const float* v2 = &pInPts[p * 4];
switch (ClippingPlane)
{
case FRUSTUM_LEFT:
t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]);
break;
case FRUSTUM_RIGHT:
t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]);
break;
case FRUSTUM_TOP:
t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]);
break;
case FRUSTUM_BOTTOM:
t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]);
break;
case FRUSTUM_NEAR:
t = ComputeInterpFactor(v1[2], v2[2]);
break;
case FRUSTUM_FAR:
t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]);
break;
default:
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
};
const float* a1 = &pInAttribs[s * numInAttribs];
const float* a2 = &pInAttribs[p * numInAttribs];
float* pOutP = &pOutPts[i * 4];
float* pOutA = &pOutAttribs[i * numInAttribs];
// Interpolate new position.
for (int j = 0; j < 4; ++j)
{
pOutP[j] = v1[j] + (v2[j] - v1[j]) * t;
}
// Interpolate Attributes
for (int attr = 0; attr < numInAttribs; ++attr)
{
pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t;
}
}
// Checks whether vertex v lies inside the clipping plane;
// in homogeneous coords, check -w <= {x,y,z} <= w.
//
template <SWR_CLIPCODES ClippingPlane>
inline int inside(const float v[4])
{
switch (ClippingPlane)
{
case FRUSTUM_LEFT:
return (v[0] >= -v[3]);
case FRUSTUM_RIGHT:
return (v[0] <= v[3]);
case FRUSTUM_TOP:
return (v[1] >= -v[3]);
case FRUSTUM_BOTTOM:
return (v[1] <= v[3]);
case FRUSTUM_NEAR:
return (v[2] >= 0.0f);
case FRUSTUM_FAR:
return (v[2] <= v[3]);
default:
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
return 0;
}
}
// Clips a polygon in homogeneous coordinates to a particular clipping plane.
// Takes in vertices of the polygon (pInPts) and the clipping plane.
// Puts the vertices of the clipped polygon in pOutPts.
// Returns the number of vertices in the clipped polygon.
//
template <SWR_CLIPCODES ClippingPlane>
int ClipTriToPlane(const float* pInPts,
int numInPts,
const float* pInAttribs,
int numInAttribs,
float* pOutPts,
float* pOutAttribs)
{
int i = 0; // number of vertices written to pOutPts so far; each vertex occupies 4 floats
for (int j = 0; j < numInPts; ++j)
{
int s = j;
int p = (j + 1) % numInPts;
int s_in = inside<ClippingPlane>(&pInPts[s * 4]);
int p_in = inside<ClippingPlane>(&pInPts[p * 4]);
// test if vertex is to be added to output vertices
if (s_in != p_in) // edge crosses clipping plane
{
// find point of intersection
intersect<ClippingPlane>(
s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
i++;
}
if (p_in) // 2nd vertex is inside clipping volume, add it to output
{
// Copy 2nd vertex position of edge over to output.
for (int k = 0; k < 4; ++k)
{
pOutPts[i * 4 + k] = pInPts[p * 4 + k];
}
// Copy 2nd vertex attributes of edge over to output.
for (int attr = 0; attr < numInAttribs; ++attr)
{
pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr];
}
i++;
}
// edge does not cross clipping plane and vertex outside clipping volume
// => do not add vertex
}
return i;
}
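// A usage sketch for the routine above (hypothetical test, not part of the
// original source): a triangle with one vertex behind the near plane clips
// to a quad, since the outside vertex is replaced by two intersections.
static int ClipTriExample()
{
    float inPts[3 * 4] = {
        0.0f, 0.0f, -1.0f, 1.0f, // v0: z < 0, outside the near plane
        1.0f, 0.0f,  1.0f, 1.0f, // v1: inside
        0.0f, 1.0f,  1.0f, 1.0f, // v2: inside
    };
    float inAttribs[3] = { 0.0f, 0.5f, 1.0f }; // one attribute per vertex
    float outPts[4 * 4];   // worst case for one plane: numInPts + 1 vertices
    float outAttribs[4];
    // Returns 4: intersection(v0,v1), v1, v2, intersection(v2,v0).
    return ClipTriToPlane<FRUSTUM_NEAR>(inPts, 3, inAttribs, 1, outPts, outAttribs);
}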
void ClipRectangles(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);
Clipper<SIMD256, 3> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
}
void ClipTriangles(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);
Clipper<SIMD256, 3> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
}
void ClipLines(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);
Clipper<SIMD256, 2> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
}
void ClipPoints(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primId,
simdscalari const& viewportIdx,
simdscalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);
Clipper<SIMD256, 1> clipper(workerId, pDC);
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
}
#if USE_SIMD16_FRONTEND
void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipRectangles, pDC->drawId);
enum
{
VERTS_PER_PRIM = 3
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipRectangles, 1);
}
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipTriangles, pDC->drawId);
enum
{
VERTS_PER_PRIM = 3
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipTriangles, 1);
}
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipLines, pDC->drawId);
enum
{
VERTS_PER_PRIM = 2
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipLines, 1);
}
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primId,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx)
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEClipPoints, pDC->drawId);
enum
{
VERTS_PER_PRIM = 1
};
Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
pa.useAlternateOffset = false;
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
RDTSC_END(pDC->pContext->pBucketMgr, FEClipPoints, 1);
}
#endif

File diff suppressed because it is too large


@@ -1,229 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file conservativerast.h
*
******************************************************************************/
#pragma once
#include <type_traits>
#include "common/simdintrin.h"
enum FixedPointFmt
{
FP_UNINIT,
_16_8,
_16_9,
_X_16,
};
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for supported Fixed Point precisions
typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit;
typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8;
typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9;
typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16;
//////////////////////////////////////////////////////////////////////////
/// @struct FixedPointTraits
/// @brief holds constants relating to converting between FP and Fixed point
/// @tparam FT: fixed precision type
template <typename FT>
struct FixedPointTraits
{
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed_16_8 specialization of FixedPointTraits
template <>
struct FixedPointTraits<Fixed_16_8>
{
/// multiplier to go from FP32 to Fixed Point 16.8
typedef std::integral_constant<uint32_t, 256> ScaleT;
/// number of bits to shift to go from 16.8 fixed => int32
typedef std::integral_constant<uint32_t, 8> BitsT;
typedef Fixed_16_8 TypeT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed_16_9 specialization of FixedPointTraits
template <>
struct FixedPointTraits<Fixed_16_9>
{
/// multiplier to go from FP32 to Fixed Point 16.9
typedef std::integral_constant<uint32_t, 512> ScaleT;
/// number of bits to shift to go from 16.9 fixed => int32
typedef std::integral_constant<uint32_t, 9> BitsT;
typedef Fixed_16_9 TypeT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Fixed_X_16 specialization of FixedPointTraits
template <>
struct FixedPointTraits<Fixed_X_16>
{
/// multiplier to go from FP32 to Fixed Point X.16
typedef std::integral_constant<uint32_t, 65536> ScaleT;
/// number of bits to shift to go from X.16 fixed => int32
typedef std::integral_constant<uint32_t, 16> BitsT;
typedef Fixed_X_16 TypeT;
};
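//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (not part of the original header): how the ScaleT and
/// BitsT constants above are typically used to move between FP32 and fixed
/// point. The helper names ToFixed/FixedToInt are hypothetical.
template <typename FT>
INLINE int32_t ToFixed(float v)
{
    // e.g. ToFixed<Fixed_16_8>(1.5f) == 384, since 1.5 * 256 == 384
    return static_cast<int32_t>(v * FixedPointTraits<FT>::ScaleT::value);
}
template <typename FT>
INLINE int32_t FixedToInt(int32_t fixed)
{
    // drop the fractional bits: FixedToInt<Fixed_16_8>(384) == 1
    return fixed >> FixedPointTraits<FT>::BitsT::value;
}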
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for conservative rasterization modes
typedef std::false_type StandardRastT;
typedef std::true_type ConservativeRastT;
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for Input Coverage rasterization modes
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT;
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT;
typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
InnerConservativeCoverageT;
//////////////////////////////////////////////////////////////////////////
/// @struct ConservativeRastFETraits
/// @brief primary ConservativeRastFETraits template. Shouldn't be instantiated
/// @tparam ConservativeT: type of conservative rasterization
template <typename ConservativeT>
struct ConservativeRastFETraits
{
};
//////////////////////////////////////////////////////////////////////////
/// @brief StandardRast specialization of ConservativeRastTraits
template <>
struct ConservativeRastFETraits<StandardRastT>
{
typedef std::false_type IsConservativeT;
typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastTraits
template <>
struct ConservativeRastFETraits<ConservativeRastT>
{
typedef std::true_type IsConservativeT;
typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief convenience typedefs for ConservativeRastFETraits
typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT;
typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT;
//////////////////////////////////////////////////////////////////////////
/// @struct ConservativeRastBETraits
/// @brief primary ConservativeRastBETraits template; when no specialization
/// matches, it defaults to standard rasterization behavior
/// @tparam ConservativeT: type of conservative rasterization
/// @tparam InputCoverageT: type of input coverage requested, if any
template <typename ConservativeT, typename _InputCoverageT>
struct ConservativeRastBETraits
{
typedef std::false_type IsConservativeT;
typedef _InputCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief StandardRastT specialization of ConservativeRastBETraits
template <typename _InputCoverageT>
struct ConservativeRastBETraits<StandardRastT, _InputCoverageT>
{
typedef std::false_type IsConservativeT;
typedef _InputCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
/// with no input coverage
template <>
struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>
{
typedef std::true_type IsConservativeT;
typedef NoInputCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
/// of having to compare individual edges to pixel corners to check if any part of the
/// triangle intersects a pixel
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
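// Worked example (not in the original source): for Fixed_16_9, ScaleT is 512,
// so the offset above evaluates to 512 / 2 + 1 == 257, i.e. half a pixel plus
// 1/512th of a pixel in 16.9 fixed point.
static_assert(ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>::
                  ConservativeEdgeOffsetT::value == 257,
              "16.9 conservative edge offset should be half a pixel plus one ULP");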
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
/// with OuterConservativeCoverage
template <>
struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT>
{
typedef std::true_type IsConservativeT;
typedef OuterConservativeCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
/// of having to compare individual edges to pixel corners to check if any part of the
/// triangle intersects a pixel
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
ConservativeEdgeOffsetT;
typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief ConservativeRastT specialization of ConservativeRastBETraits
/// with InnerConservativeCoverage
template <>
struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT>
{
typedef std::true_type IsConservativeT;
typedef InnerConservativeCoverageT InputCoverageT;
typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
/// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
/// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
/// of having to compare individual edges to pixel corners to check if any part of the
/// triangle intersects a pixel
typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
ConservativeEdgeOffsetT;
/// undo the outer conservative offset and offset the edge towards the pixel center by 1/2 pixel
/// + 1/512, in Fixed 16.9 precision. This allows the rasterizer to do the 3 edge coverage tests
/// against a single point, instead of having to compare individual edges to pixel corners to
/// check if a pixel is fully covered by a triangle
typedef std::integral_constant<int32_t,
static_cast<int32_t>(
-((ConservativePrecisionT::ScaleT::value / 2) + 1) -
ConservativeEdgeOffsetT::value)>
InnerConservativeEdgeOffsetT;
};

View file

@ -1,608 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file context.h
*
* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
* The SWR_CONTEXT is our global context and contains the DC ring,
* thread state, etc.
*
* The DRAW_CONTEXT contains all state associated with a draw operation.
*
******************************************************************************/
#pragma once
#include <condition_variable>
#include <algorithm>
#include "core/api.h"
#include "core/utils.h"
#include "core/arena.h"
#include "core/fifo.hpp"
#include "core/knobs.h"
#include "common/intrin.h"
#include "common/rdtsc_buckets.h"
#include "core/threads.h"
#include "ringbuffer.h"
#include "archrast/archrast.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE 256
// x.16 fixed point precision values
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE 65536
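// Illustrative helpers (not part of the original header) showing the intended
// use of the fixed point constants above; the names are hypothetical.
INLINE int32_t ToFixedPoint(float v)
{
    return static_cast<int32_t>(v * FIXED_POINT_SCALE); // 1.5f -> 384 in x.8
}
INLINE float FromFixedPoint(int32_t v)
{
    return static_cast<float>(v) / FIXED_POINT_SCALE; // 384 -> 1.5f
}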
struct SWR_CONTEXT;
struct DRAW_CONTEXT;
struct TRI_FLAGS
{
uint32_t frontFacing : 1;
uint32_t yMajor : 1;
uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
float pointSize;
uint32_t renderTargetArrayIndex;
uint32_t viewportIndex;
};
//////////////////////////////////////////////////////////////////////////
/// SWR_TRIANGLE_DESC
/////////////////////////////////////////////////////////////////////////
struct SWR_TRIANGLE_DESC
{
float I[3];
float J[3];
float Z[3];
float OneOverW[3];
float recipDet;
float* pRecipW;
float* pAttribs;
float* pPerspAttribs;
float* pSamplePos;
float* pUserClipBuffer;
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
// entire pixel is covered
uint64_t anyCoveredSamples;
TRI_FLAGS triFlags;
};
struct TRIANGLE_WORK_DESC
{
float* pTriBuffer;
float* pAttribs;
float* pUserClipBuffer;
uint32_t numAttribs;
TRI_FLAGS triFlags;
};
struct CLEAR_DESC
{
SWR_RECT rect;
uint32_t attachmentMask;
uint32_t renderTargetArrayIndex;
float clearRTColor[4]; // RGBA_32F
float clearDepth; // [0..1]
uint8_t clearStencil;
};
struct DISCARD_INVALIDATE_TILES_DESC
{
uint32_t attachmentMask;
SWR_RECT rect;
SWR_TILE_STATE newTileState;
bool createNewTiles;
bool fullTilesOnly;
};
struct SYNC_DESC
{
PFN_CALLBACK_FUNC pfnCallbackFunc;
uint64_t userData;
uint64_t userData2;
uint64_t userData3;
};
struct STORE_TILES_DESC
{
uint32_t attachmentMask;
SWR_TILE_STATE postStoreTileState;
SWR_RECT rect;
};
struct COMPUTE_DESC
{
uint32_t threadGroupCountX;
uint32_t threadGroupCountY;
uint32_t threadGroupCountZ;
bool enableThreadDispatch;
};
typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
uint32_t workerId,
uint32_t macroTile,
void* pDesc);
enum WORK_TYPE
{
SYNC,
DRAW,
CLEAR,
DISCARDINVALIDATETILES,
STORETILES,
SHUTDOWN,
};
OSALIGNSIMD(struct) BE_WORK
{
WORK_TYPE type;
PFN_WORK_FUNC pfnWork;
union
{
SYNC_DESC sync;
TRIANGLE_WORK_DESC tri;
CLEAR_DESC clear;
DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
} desc;
};
struct DRAW_WORK
{
DRAW_CONTEXT* pDC;
union
{
uint32_t numIndices; // DrawIndexed: Number of indices for draw.
uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
};
union
{
gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices
uint32_t startVertex; // Draw: Starting vertex in VB to render from.
};
int32_t baseVertex;
uint32_t numInstances; // Number of instances
uint32_t startInstance; // Instance offset
uint32_t startPrimID; // starting primitiveID for this draw batch
uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
SWR_FORMAT type; // index buffer type
};
typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext,
DRAW_CONTEXT* pDC,
uint32_t workerId,
void* pDesc);
struct FE_WORK
{
WORK_TYPE type;
PFN_FE_WORK_FUNC pfnWork;
union
{
SYNC_DESC sync;
DRAW_WORK draw;
CLEAR_DESC clear;
DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
} desc;
};
struct GUARDBANDS
{
float left[KNOB_NUM_VIEWPORTS_SCISSORS];
float right[KNOB_NUM_VIEWPORTS_SCISSORS];
float top[KNOB_NUM_VIEWPORTS_SCISSORS];
float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
};
struct PA_STATE;
// function signature for pipeline stages that execute after primitive assembly
typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[],
uint32_t primMask,
simdscalari const& primID,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[],
uint32_t primMask,
simd16scalari const& primID,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
OSALIGNLINE(struct) API_STATE
{
// Vertex Buffers
SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
// GS - Geometry Shader State
SWR_GS_STATE gsState;
PFN_GS_FUNC pfnGsFunc;
// FS - Fetch Shader State
PFN_FETCH_FUNC pfnFetchFunc;
// VS - Vertex Shader State
PFN_VERTEX_FUNC pfnVertexFunc;
// Index Buffer
SWR_INDEX_BUFFER_STATE indexBuffer;
// CS - Compute Shader
PFN_CS_FUNC pfnCsFunc;
uint32_t totalThreadsInGroup;
uint32_t totalSpillFillSize;
uint32_t scratchSpaceSizePerWarp;
uint32_t scratchSpaceNumWarps;
// FE - Frontend State
SWR_FRONTEND_STATE frontendState;
// SOS - Streamout Shader State
PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
// Streamout state
SWR_STREAMOUT_STATE soState;
mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS];
// Tessellation State
PFN_HS_FUNC pfnHsFunc;
PFN_DS_FUNC pfnDsFunc;
SWR_TS_STATE tsState;
// Number of attributes used by the frontend (vs, so, gs)
uint32_t feNumAttributes;
// RS - Rasterizer State
SWR_RASTSTATE rastState;
// floating point multisample offsets
float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
GUARDBANDS gbState;
SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
SWR_VIEWPORT_MATRICES vpMatrices;
SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
bool scissorsTileAligned;
bool forceFront;
PRIMITIVE_TOPOLOGY topology;
// Backend state
OSALIGNLINE(SWR_BACKEND_STATE) backendState;
SWR_DEPTH_BOUNDS_STATE depthBoundsState;
// PS - Pixel shader state
SWR_PS_STATE psState;
SWR_DEPTH_STENCIL_STATE depthStencilState;
// OM - Output Merger State
SWR_BLEND_STATE blendState;
PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
struct
{
uint32_t enableStatsFE : 1; // Enable frontend pipeline stats
uint32_t enableStatsBE : 1; // Enable backend pipeline stats
uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles
uint32_t depthHottileEnable : 1; // Enable depth buffer hottile
uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
};
PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
};
class MacroTileMgr;
class DispatchQueue;
class HOTTILE;
struct RenderOutputBuffers
{
uint8_t* pColor[SWR_NUM_RENDERTARGETS];
uint8_t* pDepth;
uint8_t* pStencil;
HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
HOTTILE* pDepthHotTile;
HOTTILE* pStencilHotTile;
};
// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
struct BarycentricCoeffs
{
simdscalar vIa;
simdscalar vIb;
simdscalar vIc;
simdscalar vJa;
simdscalar vJb;
simdscalar vJc;
simdscalar vZa;
simdscalar vZb;
simdscalar vZc;
simdscalar vRecipDet;
simdscalar vAOneOverW;
simdscalar vBOneOverW;
simdscalar vCOneOverW;
};
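// Illustrative sketch (not in the original header): the coefficients above
// define plane equations of the form I(x, y) = vIa * x + vIb * y + vIc. A
// hypothetical SIMD evaluation at a batch of sample positions would look like:
//   simdscalar vI = _simd_fmadd_ps(coeffs.vIa, vX, _simd_fmadd_ps(coeffs.vIb, vY, coeffs.vIc));
//   vI = _simd_mul_ps(vI, coeffs.vRecipDet); // normalize by the triangle determinant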
// pipeline function pointer types
typedef void (*PFN_BACKEND_FUNC)(
DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
uint8_t* (&)[SWR_NUM_RENDERTARGETS],
uint32_t,
const SWR_BLEND_STATE*,
const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
simdscalar&,
simdscalar const&);
typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
SWR_PS_CONTEXT&,
const uint64_t* const,
const uint32_t,
simdscalar const&,
simdscalar const&);
struct BACKEND_FUNCS
{
PFN_BACKEND_FUNC pfnBackend;
};
// Draw State
struct DRAW_STATE
{
API_STATE state;
void* pPrivateState; // It's required that the driver sets this up for each draw.
// pipeline function pointers, filled in by API thread when setting up the draw
BACKEND_FUNCS backendFuncs;
PFN_PROCESS_PRIMS pfnProcessPrims;
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
#endif
CachingArena* pArena; // This should only be used by API thread.
};
struct DRAW_DYNAMIC_STATE
{
void Reset(uint32_t numThreads)
{
SWR_STATS* pSavePtr = pStats;
memset(this, 0, sizeof(*this));
pStats = pSavePtr;
memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
}
///@todo Currently assumes only a single FE can do stream output for a draw.
uint32_t SoWriteOffset[4];
bool SoWriteOffsetDirty[4];
SWR_STATS_FE statsFE; // Only one FE thread per DC.
SWR_STATS* pStats;
uint64_t soPrims; // number of primitives written to StreamOut buffer
};
// Draw Context
// The api thread sets up a draw context that exists for the life of the draw.
// This draw context maintains all of the state needed for the draw operation.
struct DRAW_CONTEXT
{
SWR_CONTEXT* pContext;
union
{
MacroTileMgr* pTileMgr;
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
};
DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
CachingArena* pArena;
uint32_t drawId;
bool dependentFE; // Frontend work is dependent on all previous FE
bool dependent; // Backend work is dependent on all previous BE
bool isCompute; // Is this DC a compute context?
bool cleanupState; // True if this is the last draw using an entry in the state ring.
FE_WORK FeWork;
SYNC_DESC retireCallback; // Call this func when this DC is retired.
DRAW_DYNAMIC_STATE dynState;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
volatile OSALIGNLINE(uint32_t) FeLock;
volatile OSALIGNLINE(uint32_t) threadsDone;
};
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
SWR_ASSERT(pDC->pState != nullptr);
return pDC->pState->state;
}
INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
SWR_ASSERT(pDC->pState != nullptr);
return pDC->pState->pPrivateState;
}
class HotTileMgr;
struct SWR_CONTEXT
{
// Draw Context Ring
// Each draw needs its own state in order to support multiple draws in flight across multiple
// threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
// maximum number of draws that can be in flight at any given time.
//
// Description:
// 1. State - When an application first sets state we'll request a new draw context to use.
//    a. If there are no available draw contexts then we'll have to wait until one becomes free.
//    b. If one is available then set pCurDrawContext to point to it and mark it in use.
//    c. All state calls set state on pCurDrawContext.
// 2. Draw - Creates and submits a work item that is associated with the current draw context.
//    a. Set pPrevDrawContext = pCurDrawContext
//    b. Set pCurDrawContext to NULL.
// 3. State - When an application sets state after a draw
//    a. Same as step 1.
//    b. State is copied from the prev draw context to the current one.
// (See the illustrative lifecycle sketch after this struct.)
RingBuffer<DRAW_CONTEXT> dcRing;
DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
// that we can copy state from.
MacroTileMgr* pMacroTileManagerArray;
DispatchQueue* pDispatchQueueArray;
// Draw State Ring
// When draws are very large (lots of primitives) the API thread will break them up.
// These split draws all have identical state. So instead of storing the state directly
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
RingBuffer<DRAW_STATE> dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
uint32_t NumWorkerThreads;
uint32_t NumFEThreads;
uint32_t NumBEThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
SWR_THREADING_INFO threadInfo;
SWR_API_THREADING_INFO apiThreadInfo;
SWR_WORKER_PRIVATE_STATE workerPrivateState;
uint32_t MAX_DRAWS_IN_FLIGHT;
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
uint32_t privateStateSize;
HotTileMgr* pHotTileMgr;
// Callback functions, passed in at create context time
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
PFN_UPDATE_STREAMOUT pfnUpdateStreamOut;
// Global Stats
SWR_STATS* pStats;
// Scratch space for workers.
uint8_t** ppScratch;
volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
uint32_t frameCount;
uint32_t lastFrameChecked;
uint64_t lastDrawChecked;
TileSet* pSingleThreadLockedTiles;
// ArchRast thread contexts.
HANDLE* pArContext;
// handle to external memory for worker data to create memory contexts
HANDLE hExternalMemory;
BucketManager *pBucketMgr;
};
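// Illustrative sketch of the draw context ring lifecycle described at the top
// of SWR_CONTEXT (pseudo-code; GetDrawContext/SubmitDraw are hypothetical
// names standing in for the real API-thread entry points):
//   DRAW_CONTEXT* pDC = GetDrawContext(pContext); // 1. may block until a DC frees up
//   /* state setters write through pContext->pCurDrawContext */
//   SubmitDraw(pContext, pDC);                    // 2. pPrevDrawContext = pDC,
//                                                 //    pCurDrawContext  = NULL
//   pDC = GetDrawContext(pContext);               // 3. new DC copies state from prev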
#define UPDATE_STAT_BE(name, count) \
if (GetApiState(pDC).enableStatsBE) \
{ \
pDC->dynState.pStats[workerId].name += count; \
}
#define UPDATE_STAT_FE(name, count) \
if (GetApiState(pDC).enableStatsFE) \
{ \
pDC->dynState.statsFE.name += count; \
}
// ArchRast instrumentation framework
#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
#ifdef KNOB_ENABLE_RDTSC
#define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
#define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
#else
#define RDTSC_BEGIN(pBucketMgr, type, drawid)
#define RDTSC_END(pBucketMgr, type, count)
#endif
#ifdef KNOB_ENABLE_AR
#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
#else
#define _AR_EVENT(ctx, event)
#define _AR_FLUSH(ctx, id)
#endif
// Use these macros for api thread.
#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
// Use these macros for worker threads.
#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)

View file

@ -1,335 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file depthstencil.h
*
* @brief Implements depth/stencil functionality
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "format_conversion.h"
INLINE
void StencilOp(SWR_STENCILOP op,
simdscalar const& mask,
simdscalar const& stencilRefps,
simdscalar& stencilps)
{
simdscalari stencil = _simd_castps_si(stencilps);
switch (op)
{
case STENCILOP_KEEP:
break;
case STENCILOP_ZERO:
stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask);
break;
case STENCILOP_REPLACE:
stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask);
break;
case STENCILOP_INCRSAT:
{
simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
break;
}
case STENCILOP_DECRSAT:
{
simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
break;
}
case STENCILOP_INCR:
{
simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
break;
}
case STENCILOP_DECR:
{
simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff));
stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
break;
}
case STENCILOP_INVERT:
{
simdscalar stencilinvert =
_simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
break;
}
default:
break;
}
}
template <SWR_FORMAT depthFormatT>
simdscalar QuantizeDepth(simdscalar const& depth)
{
SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
if (depthType == SWR_TYPE_FLOAT)
{
// assume only 32bit float depth supported
SWR_ASSERT(depthBpc == 32);
// matches shader precision, no quantizing needed
return depth;
}
// should be unorm depth if not float
SWR_ASSERT(depthType == SWR_TYPE_UNORM);
float quantize = (float)((1 << depthBpc) - 1);
simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));
result = _simd_add_ps(result, _simd_set1_ps(0.5f));
result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
if (depthBpc > 16)
{
result = _simd_div_ps(result, _simd_set1_ps(quantize));
}
else
{
result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));
}
return result;
}
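// Worked example (not in the original source): for a 16-bit UNORM depth
// buffer, quantize == 65535, so an input depth of 0.25f becomes
// trunc(0.25 * 65535 + 0.5) / 65535 == 16384 / 65535 ~= 0.2500038f,
// matching the value a fixed function depth unit would store and read back.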
INLINE
simdscalar DepthStencilTest(const API_STATE* pState,
bool frontFacing,
uint32_t viewportIndex,
simdscalar const& iZ,
uint8_t* pDepthBase,
simdscalar const& coverageMask,
uint8_t* pStencilBase,
simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex];
simdscalar depthResult = _simd_set1_ps(-1.0f);
simdscalar zbuf;
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
if (pDSState->depthTestEnable)
{
switch (pDSState->depthTestFunc)
{
case ZFUNC_NEVER:
depthResult = _simd_setzero_ps();
break;
case ZFUNC_ALWAYS:
break;
default:
zbuf = _simd_load_ps((const float*)pDepthBase);
}
switch (pDSState->depthTestFunc)
{
case ZFUNC_LE:
depthResult = _simd_cmple_ps(interpZ, zbuf);
break;
case ZFUNC_LT:
depthResult = _simd_cmplt_ps(interpZ, zbuf);
break;
case ZFUNC_GT:
depthResult = _simd_cmpgt_ps(interpZ, zbuf);
break;
case ZFUNC_GE:
depthResult = _simd_cmpge_ps(interpZ, zbuf);
break;
case ZFUNC_EQ:
depthResult = _simd_cmpeq_ps(interpZ, zbuf);
break;
case ZFUNC_NE:
depthResult = _simd_cmpneq_ps(interpZ, zbuf);
break;
}
}
simdscalar stencilMask = _simd_set1_ps(-1.0f);
if (pDSState->stencilTestEnable)
{
uint8_t stencilRefValue;
uint32_t stencilTestFunc;
uint8_t stencilTestMask;
if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
{
stencilRefValue = pDSState->stencilRefValue;
stencilTestFunc = pDSState->stencilTestFunc;
stencilTestMask = pDSState->stencilTestMask;
}
else
{
stencilRefValue = pDSState->backfaceStencilRefValue;
stencilTestFunc = pDSState->backfaceStencilTestFunc;
stencilTestMask = pDSState->backfaceStencilTestMask;
}
simdvector sbuf;
simdscalar stencilWithMask;
simdscalar stencilRef;
switch (stencilTestFunc)
{
case ZFUNC_NEVER:
stencilMask = _simd_setzero_ps();
break;
case ZFUNC_ALWAYS:
break;
default:
LoadSOA<R8_UINT>(pStencilBase, sbuf);
// apply stencil read mask
stencilWithMask = _simd_castsi_ps(
_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
// do stencil compare in float to avoid simd integer emulation in AVX1
stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask));
break;
}
switch (stencilTestFunc)
{
case ZFUNC_LE:
stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_LT:
stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_GT:
stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_GE:
stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_EQ:
stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask);
break;
case ZFUNC_NE:
stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask);
break;
}
}
simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask);
*pStencilMask = stencilMask;
return depthWriteMask;
}
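// Sketch of how the test above pairs with DepthStencilWrite below in a
// backend (hypothetical surrounding code, not in the original header):
//   simdscalar stencilMask;
//   simdscalar depthMask = DepthStencilTest(&apiState, frontFacing, viewportIdx, vZ,
//                                           pDepthBase, vCoverageMask, pStencilBase,
//                                           &stencilMask);
//   DepthStencilWrite(&apiState.vp[viewportIdx], &apiState.depthStencilState, frontFacing,
//                     vZ, pDepthBase, depthMask, vCoverageMask, pStencilBase, stencilMask);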
INLINE
void DepthStencilWrite(const SWR_VIEWPORT* pViewport,
const SWR_DEPTH_STENCIL_STATE* pDSState,
bool frontFacing,
simdscalar const& iZ,
uint8_t* pDepthBase,
const simdscalar& depthMask,
const simdscalar& coverageMask,
uint8_t* pStencilBase,
const simdscalar& stencilMask)
{
if (pDSState->depthWriteEnable)
{
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
_simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ);
}
if (pDSState->stencilWriteEnable)
{
simdvector sbuf;
LoadSOA<R8_UINT>(pStencilBase, sbuf);
simdscalar stencilbuf = sbuf.v[0];
uint8_t stencilRefValue;
uint32_t stencilFailOp;
uint32_t stencilPassDepthPassOp;
uint32_t stencilPassDepthFailOp;
uint8_t stencilWriteMask;
if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
{
stencilRefValue = pDSState->stencilRefValue;
stencilFailOp = pDSState->stencilFailOp;
stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
stencilWriteMask = pDSState->stencilWriteMask;
}
else
{
stencilRefValue = pDSState->backfaceStencilRefValue;
stencilFailOp = pDSState->backfaceStencilFailOp;
stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
stencilWriteMask = pDSState->backfaceStencilWriteMask;
}
simdscalar stencilps = stencilbuf;
simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue));
simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask);
simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
simdscalar stencilPassDepthFailMask =
_simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));
simdscalar origStencil = stencilps;
StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps);
StencilOp((SWR_STENCILOP)stencilPassDepthFailOp,
stencilPassDepthFailMask,
stencilRefps,
stencilps);
StencilOp((SWR_STENCILOP)stencilPassDepthPassOp,
stencilPassDepthPassMask,
stencilRefps,
stencilps);
// apply stencil write mask
simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
stencilps =
_simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
simdvector stencilResult;
stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask);
StoreSOA<R8_UINT>(stencilResult, pStencilBase);
}
}

View file

@ -1,138 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file fifo.hpp
*
* @brief Definitions for our fifos used for thread communication.
*
******************************************************************************/
#pragma once
#include "common/os.h"
#include "arena.h"
#include <vector>
#include <cassert>
template <class T>
struct QUEUE
{
OSALIGNLINE(volatile uint32_t) mLock{0};
OSALIGNLINE(volatile uint32_t) mNumEntries{0};
std::vector<T*> mBlocks;
T* mCurBlock{nullptr};
uint32_t mHead{0};
uint32_t mTail{0};
uint32_t mCurBlockIdx{0};
// power of 2
static const uint32_t mBlockSizeShift = 6;
static const uint32_t mBlockSize = 1 << mBlockSizeShift;
template <typename ArenaT>
void clear(ArenaT& arena)
{
mHead = 0;
mTail = 0;
mBlocks.clear();
T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
mBlocks.push_back(pNewBlock);
mCurBlock = pNewBlock;
mCurBlockIdx = 0;
mNumEntries = 0;
mLock = 0;
}
uint32_t getNumQueued() { return mNumEntries; }
bool tryLock()
{
if (mLock)
{
return false;
}
// try to lock the FIFO
long initial = InterlockedCompareExchange(&mLock, 1, 0);
return (initial == 0);
}
void unlock() { mLock = 0; }
T* peek()
{
if (mNumEntries == 0)
{
return nullptr;
}
uint32_t block = mHead >> mBlockSizeShift;
return &mBlocks[block][mHead & (mBlockSize - 1)];
}
void dequeue_noinc()
{
mHead++;
mNumEntries--;
}
template <typename ArenaT>
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
const float* pSrc = (const float*)entry;
float* pDst = (float*)&mCurBlock[mTail];
auto lambda = [&](int32_t i) {
__m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH);
_mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc);
};
const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4);
static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
"FIFO element size should be multiple of SIMD width.");
UnrollerL<0, numSimdLines, 1>::step(lambda);
mTail++;
if (mTail == mBlockSize)
{
if (++mCurBlockIdx < mBlocks.size())
{
mCurBlock = mBlocks[mCurBlockIdx];
}
else
{
T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
SWR_ASSERT(newBlock);
mBlocks.push_back(newBlock);
mCurBlock = newBlock;
}
mTail = 0;
}
mNumEntries++;
return true;
}
void destroy() {}
};
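// Illustrative producer/consumer use of the QUEUE above (not part of the
// original header; WORK and arena are hypothetical stand-ins):
//   QUEUE<WORK> fifo;
//   fifo.clear(arena);                      // reset and allocate the first block
//   fifo.enqueue_try_nosync(arena, &work);  // producer side; caller provides sync
//   if (fifo.tryLock())                     // consumer side
//   {
//       while (WORK* pWork = fifo.peek())
//       {
//           Process(*pWork);
//           fifo.dequeue_noinc();
//       }
//       fifo.unlock();
//   }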

View file

@ -1,262 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file format_conversion.h
*
* @brief SOA pixel load/store and format conversion helpers
*
******************************************************************************/
#include "format_types.h"
#include "format_traits.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Load SIMD packed pixels in SOA format and converts to
/// SOA RGBA32_FLOAT format.
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template <typename SIMD_T, SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<SrcFormat>::GetBPC(0) == 32))
{
auto lambda = [&](int comp)
{
Float<SIMD_T> vComp =
SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>)));
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
return;
}
auto lambda = [&](int comp)
{
// load SIMD components
Float<SIMD_T> vComp;
FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp);
// unpack
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
// convert
if (FormatTraits<SrcFormat>::isNormalized(comp))
{
vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp));
vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
}
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
// is there a better way to get this from the SIMD traits?
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
}
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst)
{
LoadSOA<SIMD256, SrcFormat>(pSrc, dst);
}
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
{
LoadSOA<SIMD512, SrcFormat>(pSrc, dst);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Clamps the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <typename SIMD_T, SWR_FORMAT Format>
INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component)
{
Float<SIMD_T> vComp = v;
// Component is unsigned, so only the upper bound can actually be violated
if (Component >= 4)
{
    // Component shouldn't be outside the [0..3] range
assert(false);
return vComp;
}
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
{
vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps());
}
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
{
vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f));
}
vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f));
}
else if (FormatTraits<Format>::GetBPC(Component) < 32)
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
{
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
int iMin = 0;
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin));
vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax));
vComp = SIMD_T::castsi_ps(vCompi);
}
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
{
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
int iMin = -1 - iMax;
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin));
vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax));
vComp = SIMD_T::castsi_ps(vCompi);
}
}
return vComp;
}
template <SWR_FORMAT Format>
INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component)
{
return Clamp<SIMD256, Format>(v, Component);
}
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
{
return Clamp<SIMD512, Format>(v, Component);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Normalize the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <typename SIMD_T, SWR_FORMAT Format>
INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component)
{
Float<SIMD_T> r = vComp;
if (FormatTraits<Format>::isNormalized(Component))
{
r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component)));
r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r));
}
return r;
}
template <SWR_FORMAT Format>
INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component)
{
return Normalize<SIMD256, Format>(vComp, Component);
}
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
{
return Normalize<SIMD512, Format>(vComp, Component);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Convert and store simdvector of pixels in SOA
/// RGBA32_FLOAT to SOA format
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template <typename SIMD_T, SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<DstFormat>::GetBPC(0) == 32))
{
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(Float<SIMD_T>)), vComp); // stride by this SIMD's register size, matching LoadSOA
}
return;
}
auto lambda = [&](int comp) {
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
// clamp
vComp = Clamp<SIMD_T, DstFormat>(vComp, comp);
// normalize
vComp = Normalize<SIMD_T, DstFormat>(vComp, comp);
// pack
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
// store
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
// is there a better way to get this from the SIMD traits?
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
}
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst)
{
StoreSOA<SIMD256, DstFormat>(src, pDst);
}
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
{
StoreSOA<SIMD512, DstFormat>(src, pDst);
}
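// Illustrative round trip through the helpers above (not in the original
// header): expand an 8-bit UNORM SOA tile to RGBA32_FLOAT, then clamp,
// renormalize, and pack it back on store.
//   simdvector pixels;
//   LoadSOA<R8G8B8A8_UNORM>(pSrcTile, pixels);  // unpack + normalize to [0..1]
//   /* blend or shade pixels */
//   StoreSOA<R8G8B8A8_UNORM>(pixels, pDstTile); // Clamp + Normalize + pack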

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -1,939 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.h
*
* @brief Utilities used by SWR core related to pixel formats.
*
******************************************************************************/
#pragma once
#include "core/utils.h"
#include "common/simdintrin.h"
INLINE
void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
{
simd4scalari row0i = SIMD128::castps_si(row0);
simd4scalari row1i = SIMD128::castps_si(row1);
simd4scalari row2i = SIMD128::castps_si(row2);
simd4scalari row3i = SIMD128::castps_si(row3);
simd4scalari vTemp = row2i;
row2i = SIMD128::unpacklo_epi32(row2i, row3i);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
row3i = row0i;
row0i = SIMD128::unpacklo_epi32(row0i, row1i);
row3i = SIMD128::unpackhi_epi32(row3i, row1i);
row1i = row0i;
row0i = SIMD128::unpacklo_epi64(row0i, row2i);
row1i = SIMD128::unpackhi_epi64(row1i, row2i);
row2i = row3i;
row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
row0 = SIMD128::castsi_ps(row0i);
row1 = SIMD128::castsi_ps(row1i);
row2 = SIMD128::castsi_ps(row2i);
row3 = SIMD128::castsi_ps(row3i);
}
INLINE
void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
{
simd4scalari vTemp = row2;
row2 = SIMD128::unpacklo_epi32(row2, row3);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
row3 = row0;
row0 = SIMD128::unpacklo_epi32(row0, row1);
row3 = SIMD128::unpackhi_epi32(row3, row1);
row1 = row0;
row0 = SIMD128::unpacklo_epi64(row0, row2);
row1 = SIMD128::unpackhi_epi64(row1, row2);
row2 = row3;
row2 = SIMD128::unpacklo_epi64(row2, vTemp);
row3 = SIMD128::unpackhi_epi64(row3, vTemp);
}
#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(simd4scalar (&vDst)[8],
const simdscalar& vSrc0,
const simdscalar& vSrc1,
const simdscalar& vSrc2)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6y7w7
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
INLINE
void vTranspose4x8(simd4scalar (&vDst)[8],
const simdscalar& vSrc0,
const simdscalar& vSrc1,
const simdscalar& vSrc2,
const simdscalar& vSrc3)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); // x0y0z0w0 x4y4z4w4
simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); // x1y1z1w1 x5y5z5w5
r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6y7w7
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); // x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); // x3y3z3w3 x7y7z7w7
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
INLINE
void vTranspose4x16(simd16scalar (&dst)[4],
const simd16scalar& src0,
const simd16scalar& src1,
const simd16scalar& src2,
const simd16scalar& src3)
{
const simd16scalari perm =
_simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
// pre-permute input to setup the right order after all the unpacking
simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
dst[0] = _simd16_unpacklo_ps(rblo, galo);
dst[1] = _simd16_unpackhi_ps(rblo, galo);
dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
const simdscalar& vMask0,
const simdscalar& vMask1,
const simdscalar& vMask2,
const simdscalar& vMask3,
const simdscalar& vMask4,
const simdscalar& vMask5,
const simdscalar& vMask6,
const simdscalar& vMask7)
{
simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
}
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
const simdscalari& vMask0,
const simdscalari& vMask1,
const simdscalari& vMask2,
const simdscalari& vMask3,
const simdscalari& vMask4,
const simdscalari& vMask5,
const simdscalari& vMask6,
const simdscalari& vMask7)
{
vTranspose8x8(vDst,
_simd_castsi_ps(vMask0),
_simd_castsi_ps(vMask1),
_simd_castsi_ps(vMask2),
_simd_castsi_ps(vMask3),
_simd_castsi_ps(vMask4),
_simd_castsi_ps(vMask5),
_simd_castsi_ps(vMask6),
_simd_castsi_ps(vMask7));
}
#endif
//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template <uint32_t bpp>
struct TransposeSingleComponent
{
//////////////////////////////////////////////////////////////////////////
/// @brief Pass-thru for single component.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
simd4scalari c2c3 =
SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
SIMD128::store_si((simd4scalari*)pDst, c0123lo);
SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
#else
simdscalari dst01 = _simd_shuffle_epi8(src,
_simd_set_epi32(0x0f078080,
0x0e068080,
0x0d058080,
0x0c048080,
0x80800b03,
0x80800a02,
0x80800901,
0x80800800));
simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
dst23 = _simd_shuffle_epi8(dst23,
_simd_set_epi32(0x80800f07,
0x80800e06,
0x80800d05,
0x80800c04,
0x0b038080,
0x0a028080,
0x09018080,
0x08008080));
simdscalari dst = _simd_or_si(dst01, dst23);
_simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
_simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = SIMD128::unpacklo_epi8(rg, g);
SIMD128::store_si((simd4scalari*)pDst, rg);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
simdscalari dst = _simd_or_si(cvt0, shl1);
_simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
simd4scalar vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
simd16scalar dst[4];
vTranspose4x16(dst, src0, src1, src2, src3);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simd4scalar vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
simd16scalar src3 = _simd16_setzero_ps();
simd16scalar dst[4];
vTranspose4x16(dst, src0, src1, src2, src3);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
const float* pfSrc = (const float*)pSrc;
simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
float* pfDst = (float*)pDst;
SIMD128::store_ps(pfDst + 0, dst0);
SIMD128::store_ps(pfDst + 4, dst1);
SIMD128::store_ps(pfDst + 8, dst2);
SIMD128::store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
simd4scalari src_a = SIMD128::setzero_si();
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rgba0 rgba1 rgba2 rgba3
simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rgba4 rgba5 rgba6 rgba7
simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rgba8 rgba9 rgbaA rgbaB
simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rgbaC rgbaD rgbaE rgbaF
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src = _simd_load_ps((const float*)pSrc);
simd4scalar comp0 = _simd_extractf128_ps(src, 0);
simd4scalar comp1 = _simd_extractf128_ps(src, 1);
simd4scalari comp0i = SIMD128::castps_si(comp0);
simd4scalari comp1i = SIMD128::castps_si(comp1);
simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
SIMD128::store_si((simd4scalari*)pDst, resLo);
SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
}
INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD16_WIDTH == 16
// clang-format off
simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
_simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
// clang-format on
#else
#error Unsupported vector width
#endif
}
};
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 24_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64
//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64_64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64_64_64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
//////////////////////////////////////////////////////////////////////////
/// @brief Performs an SOA to AOS conversion for packed 64_64_64_64 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};

File diff suppressed because it is too large

View file

@ -1,448 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file frontend.h
*
* @brief Definitions for Frontend which handles vertex processing,
* primitive assembly, clipping, binning, etc.
*
******************************************************************************/
#pragma once
#include "context.h"
#include "common/simdintrin.h"
#include <type_traits>
//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to generate a bitmask of the low numBits bits
static INLINE uint32_t
GenMask(uint32_t numBits)
{
    SWR_ASSERT(
        numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
    // Shifting a 32-bit value by 32 is undefined behavior; handle the
    // full-width case explicitly.
    return (numBits == 32) ? 0xFFFFFFFFu : ((1U << numBits) - 1);
}
// Calculates the A and B coefficients for the 3 edges of the triangle
//
// maths for edge equations:
// standard form of a line in 2d
// Ax + By + C = 0
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
INLINE
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
// vYsub = y1 y2 y0 dc
__m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
// vY = y0 y1 y2 dc
vA = _mm_sub_ps(vY, vYsub);
// Result:
// A[0] = y0 - y1
// A[1] = y1 - y2
// A[2] = y2 - y0
// vXsub = x1 x2 x0 dc
__m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
// vX = x0 x1 x2 dc
vB = _mm_sub_ps(vXsub, vX);
// Result:
// B[0] = x1 - x0
// B[1] = x2 - x1
// B[2] = x0 - x2
}
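// Sanity check on the coefficients above (a worked substitution, added for
// clarity): plugging either endpoint into Ax + By + C gives
//   (y0 - y1)*x0 + (x1 - x0)*y0 + (x0*y1 - x1*y0)
//     = x0*y0 - x0*y1 + x1*y0 - x0*y0 + x0*y1 - x1*y0 = 0
// and likewise for (x1, y1), so each (A, B, C) triple really is the line
// through consecutive vertices.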
INLINE
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
// generate edge equations
// A = y0 - y1
// B = x1 - x0
// C = x0y1 - x1y0
__m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
vA = _mm_sub_epi32(vY, vYsub);
__m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
vB = _mm_sub_epi32(vXsub, vX);
}
INLINE
void triangleSetupABIntVertical(const simdscalari vX[3],
const simdscalari vY[3],
simdscalari (&vA)[3],
simdscalari (&vB)[3])
{
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd_sub_epi32(vY[0], vY[1]);
vA[1] = _simd_sub_epi32(vY[1], vY[2]);
vA[2] = _simd_sub_epi32(vY[2], vY[0]);
vB[0] = _simd_sub_epi32(vX[1], vX[0]);
vB[1] = _simd_sub_epi32(vX[2], vX[1]);
vB[2] = _simd_sub_epi32(vX[0], vX[2]);
}
#if ENABLE_AVX512_SIMD16
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3],
const simd16scalari vY[3],
simd16scalari (&vA)[3],
simd16scalari (&vB)[3])
{
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
vA[2] = _simd16_sub_epi32(vY[2], vY[0]);
vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif
// Calculate the determinant of the triangle
// 2 vectors between the 3 points: P, Q
// Px = x0-x2, Py = y0-y2
// Qx = x1-x2, Qy = y1-y2
// |Px Qx|
// det = | | = PxQy - PyQx
// |Py Qy|
// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
// try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
// : B[2]*A[1] - A[2]*B[1]
INLINE
float calcDeterminantInt(const __m128i vA, const __m128i vB)
{
// vAShuf = [A1, A0, A2, A0]
__m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
// vBShuf = [B2, B0, B1, B0]
__m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
// vMul = [A1*B2, B1*A2]
__m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
// shuffle upper to lower
// vMul2 = [B1*A2, B1*A2]
__m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
// vMul = [A1*B2 - B1*A2]
vMul = _mm_sub_epi64(vMul, vMul2);
int64_t result;
_mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
double dResult = (double)result;
dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
return (float)dResult;
}
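// Scalar sketch of the reduction above (illustrative only; the function name
// and array layout are hypothetical, while FIXED_POINT16_SCALE is the same
// fixed-point scale factor used by calcDeterminantInt):
INLINE float calcDeterminantScalarRef(const int32_t A[3], const int32_t B[3])
{
    // B[2]*A[1] - A[2]*B[1], widened to 64 bits as in the SIMD path
    int64_t det = (int64_t)A[1] * B[2] - (int64_t)A[2] * B[1];
    return (float)((double)det * (1.0 / FIXED_POINT16_SCALE));
}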
INLINE
void calcDeterminantIntVertical(const simdscalari vA[3],
const simdscalari vB[3],
simdscalari* pvDet)
{
// refer to calcDeterminantInt comment for calculation explanation
// A1*B2
simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
// B1*A2
simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
// A1*B2 - A2*B1
simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
// shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
// shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
pvDet[0] = vResultLo;
pvDet[1] = vResultHi;
}
#if ENABLE_AVX512_SIMD16
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3],
const simd16scalari vB[3],
simd16scalari* pvDet)
{
// refer to calcDeterminantInt comment for calculation explanation
// A1*B2
simd16scalari vA1_lo =
_simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F
simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b)
simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F
// B1*A2
simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);
// A1*B2 - A2*B1
simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b)
simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F
// (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F
// (3, 1, 2, 0) = 11 01 10 00 = 0xD8
pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F
}
#endif
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
{
// C = -Ax - By
vC = _mm_mul_ps(vA, vX);
__m128 vCy = _mm_mul_ps(vB, vY);
vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
vC = _mm_sub_ps(vC, vCy);
}
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
}
}
#if USE_SIMD16_FRONTEND
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
}
}
#endif
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector* v,
const SWR_VIEWPORT_MATRICES& vpMatrices,
simdscalari const& vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
}
}
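// Per-lane scalar equivalent of the gather variant above (an illustrative
// sketch, not the shipped path): each lane is transformed by the viewport
// matrix selected by its own viewport index.
INLINE void viewportTransformScalarRef(float& x, float& y, float& z,
                                       const SWR_VIEWPORT_MATRICES& vpMatrices,
                                       uint32_t viewportIdx)
{
    x = x * vpMatrices.m00[viewportIdx] + vpMatrices.m30[viewportIdx];
    y = y * vpMatrices.m11[viewportIdx] + vpMatrices.m31[viewportIdx];
    z = z * vpMatrices.m22[viewportIdx] + vpMatrices.m32[viewportIdx];
}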
#if USE_SIMD16_FRONTEND
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector* v,
const SWR_VIEWPORT_MATRICES& vpMatrices,
simd16scalari const& vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
}
}
#endif
INLINE
void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox)
{
// Need a horizontal integer min/max across the 3 vertices here
__m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
__m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
__m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
__m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
__m128i vMinX = _mm_min_epi32(vX, vX1);
vMinX = _mm_min_epi32(vMinX, vX2);
__m128i vMaxX = _mm_max_epi32(vX, vX1);
vMaxX = _mm_max_epi32(vMaxX, vX2);
__m128i vMinY = _mm_min_epi32(vY, vY1);
vMinY = _mm_min_epi32(vMinY, vY2);
__m128i vMaxY = _mm_max_epi32(vY, vY1);
vMaxY = _mm_max_epi32(vMaxY, vY2);
bbox.xmin = _mm_extract_epi32(vMinX, 0);
bbox.xmax = _mm_extract_epi32(vMaxX, 0);
bbox.ymin = _mm_extract_epi32(vMinY, 0);
bbox.ymax = _mm_extract_epi32(vMaxY, 0);
}
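// Scalar equivalent of the shuffle-based reduction above (illustrative
// sketch; requires <algorithm>, and the name is hypothetical):
INLINE void calcBoundingBoxIntScalarRef(const int32_t x[3], const int32_t y[3], SWR_RECT& bbox)
{
    bbox.xmin = std::min(x[0], std::min(x[1], x[2]));
    bbox.xmax = std::max(x[0], std::max(x[1], x[2]));
    bbox.ymin = std::min(y[0], std::min(y[1], y[2]));
    bbox.ymax = std::max(y[0], std::max(y[1], y[2]));
}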
INLINE
bool CanUseSimplePoints(DRAW_CONTEXT* pDC)
{
const API_STATE& state = GetApiState(pDC);
return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
state.rastState.pointSize == 1.0f && !state.rastState.pointParam &&
!state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask);
}
INLINE
bool vHasNaN(const __m128& vec)
{
const __m128 result = _mm_cmpunord_ps(vec, vec);
const int32_t mask = _mm_movemask_ps(result);
return (mask != 0);
}
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
// Returns the ProcessDraw front-end work function; all combinations of the parameter values are available
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
bool IsCutIndexEnabled,
bool HasTessellation,
bool HasGeometryShader,
bool HasStreamOut,
bool HasRasterization);
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
void ProcessStoreTiles(SWR_CONTEXT* pContext,
DRAW_CONTEXT* pDC,
uint32_t workerId,
void* pUserData);
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
DRAW_CONTEXT* pDC,
uint32_t workerId,
void* pUserData);
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
#endif
struct PA_STATE_BASE; // forward decl
void BinPoints(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[3],
uint32_t primMask,
simdscalari const& primID,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
void BinLines(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prims[3],
uint32_t primMask,
simdscalari const& primID,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
#if USE_SIMD16_FRONTEND
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[3],
uint32_t primMask,
simd16scalari const& primID,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
void SIMDCALL BinLines_simd16(DRAW_CONTEXT* pDC,
PA_STATE& pa,
uint32_t workerId,
simd16vector prims[3],
uint32_t primMask,
simd16scalari const& primID,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
#endif

View file

@ -1,175 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file knobs.h
*
* @brief Static (Compile-Time) Knobs for Core.
*
******************************************************************************/
#pragma once
#include <stdint.h>
#include <gen_knobs.h>
#define KNOB_ARCH_AVX 0
#define KNOB_ARCH_AVX2 1
#define KNOB_ARCH_AVX512 2
///////////////////////////////////////////////////////////////////////////////
// AVX512 Support
///////////////////////////////////////////////////////////////////////////////
#define ENABLE_AVX512_SIMD16 1
#define USE_SIMD16_FRONTEND 1
#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS
///////////////////////////////////////////////////////////////////////////////
// Architecture validation
///////////////////////////////////////////////////////////////////////////////
#if !defined(KNOB_ARCH)
#define KNOB_ARCH KNOB_ARCH_AVX
#endif
#if (KNOB_ARCH == KNOB_ARCH_AVX)
#define KNOB_ARCH_ISA AVX
#define KNOB_ARCH_STR "AVX"
#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
#define KNOB_ARCH_ISA AVX2
#define KNOB_ARCH_STR "AVX2"
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
#define KNOB_ARCH_ISA AVX512F
#define KNOB_ARCH_STR "AVX512"
#else
#error "Unknown architecture"
#endif
#define KNOB_SIMD_WIDTH 8
#define KNOB_SIMD_BYTES 32
#define KNOB_SIMD16_WIDTH 16
#define KNOB_SIMD16_BYTES 64
#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
///////////////////////////////////////////////////////////////////////////////
// Configuration knobs
///////////////////////////////////////////////////////////////////////////////
// Maximum supported number of active vertex buffer streams
#define KNOB_NUM_STREAMS 32
// Maximum supported active viewports and scissors
#define KNOB_NUM_VIEWPORTS_SCISSORS 16
// Guardband range used by the clipper
#define KNOB_GUARDBAND_WIDTH 32768.0f
#define KNOB_GUARDBAND_HEIGHT 32768.0f
// Scratch space requirements per worker. Currently only used for TGSM sizing for some stages
#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024)
///////////////////////////////
// Macro tile configuration
///////////////////////////////
// raster tile dimensions
#define KNOB_TILE_X_DIM 8
#define KNOB_TILE_X_DIM_SHIFT 3
#define KNOB_TILE_Y_DIM 8
#define KNOB_TILE_Y_DIM_SHIFT 3
// fixed macrotile pixel dimension for now, eventually will be
// dynamically set based on tile format and pixel size
#define KNOB_MACROTILE_X_DIM 32
#define KNOB_MACROTILE_Y_DIM 32
#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13
#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13
#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
// total # of hot tiles available. This should be enough to
// fully render a 16kx16k 128bpp render target
#define KNOB_NUM_HOT_TILES_X 512
#define KNOB_NUM_HOT_TILES_Y 512
#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT
// Max scissor rectangle
#define KNOB_MAX_SCISSOR_X (KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM)
#define KNOB_MAX_SCISSOR_Y (KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM)
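// Consistency checks on the fixed-point macrotile arithmetic (illustrative
// additions, not in the original header): 32 << 8 == 8192 == 1 << 13, and a
// 32-pixel macrotile spans four 8-pixel raster tiles.
static_assert(KNOB_MACROTILE_X_DIM_FIXED == (1 << KNOB_MACROTILE_X_DIM_FIXED_SHIFT),
              "macrotile fixed-point width disagrees with its shift");
static_assert(KNOB_MACROTILE_Y_DIM_FIXED == (1 << KNOB_MACROTILE_Y_DIM_FIXED_SHIFT),
              "macrotile fixed-point height disagrees with its shift");
static_assert(KNOB_MACROTILE_X_DIM_IN_TILES == 4, "expected 32 / 8 raster tiles per row");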
#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4
#error "incompatible width/tile dimensions"
#endif
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16 && KNOB_TILE_X_DIM < 8
#error "incompatible width/tile dimensions"
#endif
#endif
#if KNOB_SIMD_WIDTH == 8
#define SIMD_TILE_X_DIM 4
#define SIMD_TILE_Y_DIM 2
#else
#error "Invalid simd width"
#endif
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16
#define SIMD16_TILE_X_DIM 8
#define SIMD16_TILE_Y_DIM 2
#else
#error "Invalid simd width"
#endif
#endif
///////////////////////////////////////////////////////////////////////////////
// Optimization knobs
///////////////////////////////////////////////////////////////////////////////
#define KNOB_USE_FAST_SRGB TRUE
// enables cut-aware primitive assembler
#define KNOB_ENABLE_CUT_AWARE_PA TRUE
// enables early rasterization (useful for small triangles)
#if !defined(KNOB_ENABLE_EARLY_RAST)
#define KNOB_ENABLE_EARLY_RAST 1
#endif
#if KNOB_ENABLE_EARLY_RAST
#define ER_SIMD_TILE_X_SHIFT 2
#define ER_SIMD_TILE_Y_SHIFT 2
#endif
///////////////////////////////////////////////////////////////////////////////
// Debug knobs
///////////////////////////////////////////////////////////////////////////////
//#define KNOB_ENABLE_RDTSC
// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
#if !defined(KNOB_ENABLE_TOSS_POINTS)
#define KNOB_ENABLE_TOSS_POINTS 0
#endif

View file

@ -1,108 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file knobs_init.h
*
* @brief Dynamic Knobs Initialization for Core.
*
******************************************************************************/
#pragma once
#include <core/knobs.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
// Assume the type is compatible with a 32-bit integer
template <typename T>
static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
{
uint32_t value = 0;
char* pStopped = nullptr;
value = strtoul(pOverride, &pStopped, 0);
if (pStopped != pOverride)
{
knobValue = static_cast<T>(value);
}
}
static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
{
size_t len = strlen(pOverride);
if (len == 1)
{
auto c = tolower(pOverride[0]);
if (c == 'y' || c == 't' || c == '1')
{
knobValue = true;
return;
}
if (c == 'n' || c == 'f' || c == '0')
{
knobValue = false;
return;
}
}
// Try converting to a number and casting to bool
uint32_t value = 0;
char* pStopped = nullptr;
value = strtoul(pOverride, &pStopped, 0);
if (pStopped != pOverride)
{
knobValue = value != 0;
}
}
static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
{
float value = knobValue;
if (sscanf(pOverride, "%f", &value))
{
knobValue = value;
}
}
static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue)
{
knobValue = pOverride;
}
template <typename T>
static inline void InitKnob(T& knob)
{
// Read environment variables
const char* pOverride = getenv(knob.Name());
if (pOverride)
{
auto knobValue = knob.DefaultValue();
ConvertEnvToKnob(pOverride, knobValue);
knob.Value(knobValue);
}
else
{
// Set default value
knob.Value(knob.DefaultValue());
}
}
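// Sketch of the interface InitKnob expects (a hypothetical knob type for
// illustration only; the real knob types live elsewhere in the driver):
struct ExampleKnob
{
    const char* Name() const { return "KNOB_EXAMPLE"; } // env var to read
    uint32_t    DefaultValue() const { return 4; }
    void        Value(uint32_t v) { m_value = v; }
    uint32_t    m_value = 0;
};
// With this, InitKnob(knob) would honor e.g. KNOB_EXAMPLE=8 or
// KNOB_EXAMPLE=0x10 in the environment (strtoul with base 0 accepts both
// decimal and hex) and fall back to DefaultValue() otherwise.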

View file

@ -1,459 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file multisample.h
*
******************************************************************************/
#pragma once
#include "context.h"
#include "format_traits.h"
//////////////////////////////////////////////////////////////////////////
/// @brief Convenience typedef for testing for the single-sample case
typedef std::integral_constant<int, 1> SingleSampleT;
INLINE
SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
{
switch (numSamples)
{
case 1:
return SWR_MULTISAMPLE_1X;
case 2:
return SWR_MULTISAMPLE_2X;
case 4:
return SWR_MULTISAMPLE_4X;
case 8:
return SWR_MULTISAMPLE_8X;
case 16:
return SWR_MULTISAMPLE_16X;
default:
assert(0);
return SWR_MULTISAMPLE_1X;
}
}
// hardcoded offsets based on Direct3D standard multisample positions
// 16 x 16 subpixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
// coords are 0.8 fixed-point offsets from (0, 0)
template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false>
struct MultisampleTraits
{
INLINE static float X(uint32_t sampleNum) = delete;
INLINE static float Y(uint32_t sampleNum) = delete;
INLINE static simdscalari FullSampleMask() = delete;
static const uint32_t numSamples = 0;
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_1X, false>
{
INLINE static float X(uint32_t sampleNum) { return samplePosX[sampleNum]; };
INLINE static float Y(uint32_t sampleNum) { return samplePosY[sampleNum]; };
INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
static const uint32_t numSamples = 1;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
static constexpr uint32_t samplePosXi[1] = {0x80};
static constexpr uint32_t samplePosYi[1] = {0x80};
static constexpr float samplePosX[1] = {0.5f};
static constexpr float samplePosY[1] = {0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_1X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
static const uint32_t numSamples = 1;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
static constexpr uint32_t samplePosXi[1] = {0x80};
static constexpr uint32_t samplePosYi[1] = {0x80};
static constexpr float samplePosX[1] = {0.5f};
static constexpr float samplePosY[1] = {0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_2X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0x3);
return mask;
}
static const uint32_t numSamples = 2;
static const uint32_t numCoverageSamples = 2;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
static constexpr uint32_t samplePosXi[2] = {0xC0, 0x40};
static constexpr uint32_t samplePosYi[2] = {0xC0, 0x40};
static constexpr float samplePosX[2] = {0.75f, 0.25f};
static constexpr float samplePosY[2] = {0.75f, 0.25f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_2X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0x3);
return mask;
}
static const uint32_t numSamples = 2;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
static constexpr uint32_t samplePosXi[2] = {0x80, 0x80};
static constexpr uint32_t samplePosYi[2] = {0x80, 0x80};
static constexpr float samplePosX[2] = {0.5f, 0.5f};
static constexpr float samplePosY[2] = {0.5f, 0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_4X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return mask;
}
static const uint32_t numSamples = 4;
static const uint32_t numCoverageSamples = 4;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
static constexpr uint32_t samplePosXi[4] = {0x60, 0xE0, 0x20, 0xA0};
static constexpr uint32_t samplePosYi[4] = {0x20, 0x60, 0xA0, 0xE0};
static constexpr float samplePosX[4] = {0.375f, 0.875f, 0.125f, 0.625f};
static constexpr float samplePosY[4] = {0.125f, 0.375f, 0.625f, 0.875f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_4X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xF);
return mask;
}
static const uint32_t numSamples = 4;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
static constexpr uint32_t samplePosXi[4] = {0x80, 0x80, 0x80, 0x80};
static constexpr uint32_t samplePosYi[4] = {0x80, 0x80, 0x80, 0x80};
static constexpr float samplePosX[4] = {0.5f, 0.5f, 0.5f, 0.5f};
static constexpr float samplePosY[4] = {0.5f, 0.5f, 0.5f, 0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_8X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return mask;
}
static const uint32_t numSamples = 8;
static const uint32_t numCoverageSamples = 8;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
static constexpr float samplePosX[8] = {
0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f};
static constexpr float samplePosY[8] = {
0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_8X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFF);
return mask;
}
static const uint32_t numSamples = 8;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
static constexpr float samplePosX[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
static constexpr float samplePosY[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_16X, false>
{
INLINE static float X(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosX[sampleNum];
};
INLINE static float Y(uint32_t sampleNum)
{
SWR_ASSERT(sampleNum < numSamples);
return samplePosY[sampleNum];
};
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
return mask;
}
static const uint32_t numSamples = 16;
static const uint32_t numCoverageSamples = 16;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
static constexpr uint32_t samplePosXi[16] = {0x90,
0x70,
0x50,
0xC0,
0x30,
0xA0,
0xD0,
0xB0,
0x60,
0x80,
0x40,
0x20,
0x00,
0xF0,
0xE0,
0x10};
static constexpr uint32_t samplePosYi[16] = {0x90,
0x50,
0xA0,
0x70,
0x60,
0xD0,
0xB0,
0x30,
0xE0,
0x10,
0x20,
0xC0,
0x80,
0x40,
0xF0,
0x00};
static constexpr float samplePosX[16] = {0.5625f,
0.4375f,
0.3125f,
0.7500f,
0.1875f,
0.6250f,
0.8125f,
0.6875f,
0.3750f,
0.5000f,
0.2500f,
0.1250f,
0.0000f,
0.9375f,
0.8750f,
0.0625f};
static constexpr float samplePosY[16] = {0.5625f,
0.3125f,
0.6250f,
0.4375f,
0.3750f,
0.8125f,
0.6875f,
0.1875f,
0.8750f,
0.0625f,
0.1250f,
0.7500f,
0.5000f,
0.2500f,
0.9375f,
0.0000f};
};
template <>
struct MultisampleTraits<SWR_MULTISAMPLE_16X, true>
{
INLINE static float X(uint32_t sampleNum) { return 0.5f; };
INLINE static float Y(uint32_t sampleNum) { return 0.5f; };
INLINE static simdscalari FullSampleMask()
{
static const simdscalari mask = _simd_set1_epi32(0xFFFF);
return mask;
}
static const uint32_t numSamples = 16;
static const uint32_t numCoverageSamples = 1;
static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
static constexpr uint32_t samplePosXi[16] = {0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80};
static constexpr uint32_t samplePosYi[16] = {0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80,
0x80};
static constexpr float samplePosX[16] = {0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f};
static constexpr float samplePosY[16] = {0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f,
0.5f};
};
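// The integer sample positions above are the float positions quantized to
// 8-bit subpixel precision (1/256 pixel): e.g. 0x90 == 144/256 == 0.5625,
// and the centered patterns store 0x80 == 0.5 everywhere. A minimal check
// of that relationship (illustrative only; ToFixed8 is not part of the
// driver):
static constexpr uint32_t ToFixed8(float pos)
{
    return static_cast<uint32_t>(pos * 256.0f);
}
static_assert(ToFixed8(0.5625f) == 0x90, "8x sample 0 X position");
static_assert(ToFixed8(0.5f) == 0x80, "centered sample position");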
INLINE
bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount,
const SWR_MULTISAMPLE_POS& samplePos)
{
// detect if we're using standard or center sample patterns
    const uint32_t *standardPosX = nullptr, *standardPosY = nullptr;
switch (sampleCount)
{
case SWR_MULTISAMPLE_1X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi;
break;
case SWR_MULTISAMPLE_2X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi;
break;
case SWR_MULTISAMPLE_4X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi;
break;
case SWR_MULTISAMPLE_8X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi;
break;
case SWR_MULTISAMPLE_16X:
standardPosX = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi;
standardPosY = MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi;
break;
    default:
        SWR_ASSERT(0, "Unexpected sample count");
        break;
}
// scan sample pattern for standard or center
uint32_t numSamples = GetNumSamples(sampleCount);
bool bIsStandard = true;
if (numSamples > 1)
{
for (uint32_t i = 0; i < numSamples; i++)
{
            // a standard sample must match the reference in both axes
            bIsStandard =
                (standardPosX[i] == samplePos.Xi(i)) && (standardPosY[i] == samplePos.Yi(i));
if (!bIsStandard)
break;
}
}
return !bIsStandard;
}
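// Self-contained restatement of the scan above on plain arrays
// (illustrative; MatchesReferencePattern is not part of the driver): a
// pattern is standard only when every sample matches the reference table
// in both axes.
static inline bool MatchesReferencePattern(const uint32_t* refX,
                                           const uint32_t* refY,
                                           const uint32_t* posX,
                                           const uint32_t* posY,
                                           uint32_t numSamples)
{
    for (uint32_t i = 0; i < numSamples; i++)
    {
        if (refX[i] != posX[i] || refY[i] != posY[i])
        {
            return false; // first mismatch proves the pattern is custom
        }
    }
    return true;
}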

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,473 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rasterizer.cpp
*
* @brief Implementation for the rasterizer.
*
******************************************************************************/
#include <vector>
#include <algorithm>
#include "rasterizer.h"
#include "backends/gen_rasterizer.hpp"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
#include "rasterizer_impl.h"
PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
[STATE_VALID_TRI_EDGE_COUNT][2];
void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_BIN_TRIS)
{
return;
}
#endif
// bloat line to two tris and call the triangle rasterizer twice
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId);
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
// macrotile dimensioning
uint32_t macroX, macroY;
MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
const SWR_RECT& scissorInFixedPoint =
state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
// create a copy of the triangle buffer to write our adjusted vertices to
OSALIGNSIMD(float) newTriBuffer[4 * 4];
TRIANGLE_WORK_DESC newWorkDesc = workDesc;
newWorkDesc.pTriBuffer = &newTriBuffer[0];
// create a copy of the attrib buffer to write our adjusted attribs to
OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
newWorkDesc.pAttribs = &newAttribBuffer[0];
const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
__m128 vX, vY, vZ, vRecipW;
vX = _mm_load_ps(workDesc.pTriBuffer);
vY = _mm_load_ps(workDesc.pTriBuffer + 4);
vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
// triangle 0
// v0,v1 -> v0,v0,v1
__m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
__m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
__m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
if (workDesc.triFlags.yMajor)
{
vXa = _mm_add_ps(vAdjust, vXa);
}
else
{
vYa = _mm_add_ps(vAdjust, vYa);
}
// Store triangle description for rasterizer
_mm_store_ps((float*)&newTriBuffer[0], vXa);
_mm_store_ps((float*)&newTriBuffer[4], vYa);
_mm_store_ps((float*)&newTriBuffer[8], vZa);
_mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
// binner bins 3 edges for lines as v0, v1, v1
// tri0 needs v0, v0, v1
for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
{
__m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
__m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
}
// Store user clip distances for triangle 0
float newClipBuffer[3 * 8];
uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
if (numClipDist)
{
newWorkDesc.pUserClipBuffer = newClipBuffer;
float* pOldBuffer = workDesc.pUserClipBuffer;
float* pNewBuffer = newClipBuffer;
for (uint32_t i = 0; i < numClipDist; ++i)
{
// read barycentric coeffs from binner
float a = *(pOldBuffer++);
float b = *(pOldBuffer++);
// reconstruct original clip distance at vertices
float c0 = a + b;
float c1 = b;
// construct triangle barycentrics
*(pNewBuffer++) = c0 - c1;
*(pNewBuffer++) = c0 - c1;
*(pNewBuffer++) = c1;
}
}
// setup triangle rasterizer function
PFN_WORK_FUNC pfnTriRast;
// conservative rast not supported for points/lines
pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
rastState.bIsCenterPattern,
false,
SWR_INPUT_COVERAGE_NONE,
EdgeValToEdgeState(ALL_EDGES_VALID),
(pDC->pState->state.scissorsTileAligned == false));
// make sure this macrotile intersects the triangle
__m128i vXai = fpToFixedPoint(vXa);
__m128i vYai = fpToFixedPoint(vYa);
OSALIGNSIMD(SWR_RECT) bboxA;
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
{
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
// triangle 1
// v0,v1 -> v1,v1,v0
vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
if (workDesc.triFlags.yMajor)
{
vXa = _mm_add_ps(vAdjust, vXa);
}
else
{
vYa = _mm_add_ps(vAdjust, vYa);
}
// Store triangle description for rasterizer
_mm_store_ps((float*)&newTriBuffer[0], vXa);
_mm_store_ps((float*)&newTriBuffer[4], vYa);
_mm_store_ps((float*)&newTriBuffer[8], vZa);
_mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
// binner bins 3 edges for lines as v0, v1, v1
// tri1 needs v1, v1, v0
for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
{
__m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
__m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
_mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
}
// store user clip distance for triangle 1
if (numClipDist)
{
float* pOldBuffer = workDesc.pUserClipBuffer;
float* pNewBuffer = newClipBuffer;
for (uint32_t i = 0; i < numClipDist; ++i)
{
// read barycentric coeffs from binner
float a = *(pOldBuffer++);
float b = *(pOldBuffer++);
// reconstruct original clip distance at vertices
float c0 = a + b;
float c1 = b;
// construct triangle barycentrics
*(pNewBuffer++) = c1 - c0;
*(pNewBuffer++) = c1 - c0;
*(pNewBuffer++) = c0;
}
}
vXai = fpToFixedPoint(vXa);
vYai = fpToFixedPoint(vYa);
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
{
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
    RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeLine, 1);
}
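// Geometry sketch (illustrative, standalone; Pt and BloatLineToTris are not
// part of the driver): bloating a line of width w into two triangles,
// matching the vBloat0/vBloat1 offsets above for an x-major line (the
// offsets land on Y; y-major lines offset X instead).
struct Pt
{
    float x, y;
};
static inline void BloatLineToTris(Pt v0, Pt v1, float w, Pt tri0[3], Pt tri1[3])
{
    const float h = 0.5f * w;
    // tri0 mirrors the v0,v0,v1 shuffle with offsets {+h, -h, -h}
    tri0[0] = {v0.x, v0.y + h};
    tri0[1] = {v0.x, v0.y - h};
    tri0[2] = {v1.x, v1.y - h};
    // tri1 mirrors the v1,v1,v0 shuffle with offsets {-h, +h, +h}
    tri1[0] = {v1.x, v1.y - h};
    tri1[1] = {v1.x, v1.y + h};
    tri1[2] = {v0.x, v0.y + h};
}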
void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_BIN_TRIS)
{
return;
}
#endif
const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
// map x,y relative offsets from start of raster tile to bit position in
// coverage mask for the point
static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
{2, 3, 6, 7, 10, 11, 14, 15},
{16, 17, 20, 21, 24, 25, 28, 29},
{18, 19, 22, 23, 26, 27, 30, 31},
{32, 33, 36, 37, 40, 41, 44, 45},
{34, 35, 38, 39, 42, 43, 46, 47},
{48, 49, 52, 53, 56, 57, 60, 61},
{50, 51, 54, 55, 58, 59, 62, 63}};
OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
// pull point information from triangle buffer
// @todo use structs for readability
uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
float z = *(workDesc.pTriBuffer + 2);
// construct triangle descriptor for point
// no interpolation, set up i,j for constant interpolation of z and attribs
// @todo implement an optimized backend that doesn't require triangle information
    // compute coverage mask from the x,y offsets packed into the coverageMask
    // field; mask each offset to the maximum valid coverageMap index
uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
{
triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
}
triDesc.anyCoveredSamples = triDesc.coverageMask[0];
triDesc.innerCoverageMask = triDesc.coverageMask[0];
// no persp divide needed for points
triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
triDesc.triFlags = workDesc.triFlags;
triDesc.recipDet = 1.0f;
triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
RenderOutputBuffers renderBuffers;
GetRenderHotTiles(pDC,
workerId,
macroTile,
tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
renderBuffers,
triDesc.triFlags.renderTargetArrayIndex);
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
}
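// Decoding sketch for the packed point coverage above (illustrative,
// standalone; PointCoverageBit is not part of the driver): the low nibble
// of the packed mask carries the x tile offset, the next nibble carries y,
// and the table swizzle reflects the 2x2-quad SIMD pixel order rather than
// row-major order.
static inline uint64_t PointCoverageBit(uint32_t packedMask, const uint32_t map[8][8])
{
    uint32_t tX = packedMask & 0x7;        // x offset within the raster tile
    uint32_t tY = (packedMask >> 4) & 0x7; // y offset within the raster tile
    return 1ULL << map[tY][tX];            // e.g. (x=3, y=2) -> bit 21
}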
void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
{
const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
// load point vertex
float x = *workDesc.pTriBuffer;
float y = *(workDesc.pTriBuffer + 1);
float z = *(workDesc.pTriBuffer + 2);
// create a copy of the triangle buffer to write our adjusted vertices to
OSALIGNSIMD(float) newTriBuffer[4 * 4];
TRIANGLE_WORK_DESC newWorkDesc = workDesc;
newWorkDesc.pTriBuffer = &newTriBuffer[0];
// create a copy of the attrib buffer to write our adjusted attribs to
OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
newWorkDesc.pAttribs = &newAttribBuffer[0];
newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
newWorkDesc.numAttribs = workDesc.numAttribs;
newWorkDesc.triFlags = workDesc.triFlags;
// construct two tris by bloating point by point size
float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
float lowerX = x - halfPointSize;
float upperX = x + halfPointSize;
float lowerY = y - halfPointSize;
float upperY = y + halfPointSize;
// tri 0
float* pBuf = &newTriBuffer[0];
*pBuf++ = lowerX;
*pBuf++ = lowerX;
*pBuf++ = upperX;
pBuf++;
*pBuf++ = lowerY;
*pBuf++ = upperY;
*pBuf++ = upperY;
pBuf++;
_mm_store_ps(pBuf, _mm_set1_ps(z));
_mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
// setup triangle rasterizer function
PFN_WORK_FUNC pfnTriRast;
// conservative rast not supported for points/lines
pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
rastState.bIsCenterPattern,
false,
SWR_INPUT_COVERAGE_NONE,
EdgeValToEdgeState(ALL_EDGES_VALID),
(pDC->pState->state.scissorsTileAligned == false));
// overwrite texcoords for point sprites
if (isPointSpriteTexCoordEnabled)
{
// copy original attribs
memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
newWorkDesc.pAttribs = &newAttribBuffer[0];
// overwrite texcoord for point sprites
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask))
{
texCoordMask &= ~(1 << texCoordAttrib);
__m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
if (rastState.pointSpriteTopOrigin)
{
pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
}
else
{
pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
}
}
}
else
{
// no texcoord overwrite, can reuse the attrib buffer from frontend
newWorkDesc.pAttribs = workDesc.pAttribs;
}
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
// tri 1
pBuf = &newTriBuffer[0];
*pBuf++ = lowerX;
*pBuf++ = upperX;
*pBuf++ = upperX;
pBuf++;
*pBuf++ = lowerY;
*pBuf++ = upperY;
*pBuf++ = lowerY;
// z, w unchanged
if (isPointSpriteTexCoordEnabled)
{
uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
unsigned long texCoordAttrib = 0;
while (_BitScanForward(&texCoordAttrib, texCoordMask))
{
texCoordMask &= ~(1 << texCoordAttrib);
__m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
if (rastState.pointSpriteTopOrigin)
{
pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
}
else
{
pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
}
}
}
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
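// Decoding the point-sprite texcoord writes above (illustrative): with
// _mm_set_ps(q, r, t, s), tri 0's corners for the top origin are
//   v0 (lowerX, lowerY) -> (s, t) = (0, 0)
//   v1 (lowerX, upperY) -> (s, t) = (0, 1)
//   v2 (upperX, upperY) -> (s, t) = (1, 1)
// The bottom origin simply flips t, and tri 1 covers the other half of the
// sprite quad with the complementary corners.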
void InitRasterizerFunctions()
{
InitRasterizerFuncs();
}
// Selector for correct templated RasterizeTriangle function
PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
bool IsCenter,
bool IsConservative,
SWR_INPUT_COVERAGE InputCoverage,
uint32_t EdgeEnable,
bool RasterizeScissorEdges)
{
SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
[EdgeEnable][RasterizeScissorEdges];
SWR_ASSERT(func);
return func;
}
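// Usage sketch (illustrative; GetDefault4xRasterizer is not part of the
// driver): fetch the specialization for 4x MSAA with a standard pattern,
// no conservative rast, no input coverage, all edges valid, and
// tile-aligned scissors.
static PFN_WORK_FUNC GetDefault4xRasterizer()
{
    return GetRasterizerFunc(SWR_MULTISAMPLE_4X,
                             false, // IsCenter
                             false, // IsConservative
                             SWR_INPUT_COVERAGE_NONE,
                             EdgeValToEdgeState(ALL_EDGES_VALID),
                             false); // RasterizeScissorEdges
}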


@ -1,237 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file rasterizer.h
*
* @brief Definitions for the rasterizer.
*
******************************************************************************/
#pragma once
#include "context.h"
#include <type_traits>
#include "conservativeRast.h"
#include "multisample.h"
void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
void InitRasterizerFunctions();
INLINE
__m128i fpToFixedPoint(const __m128 vIn)
{
__m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE));
return _mm_cvtps_epi32(vFixed);
}
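// Worked example (illustrative): with 16.8 fixed point the scale is 256,
// so an x coordinate of 100.625f converts to 100.625 * 256 = 25760
// (0x64A0); _mm_cvtps_epi32 rounds to nearest, so subpixel positions on
// 1/256 boundaries survive the conversion exactly.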
enum TriEdgesStates
{
STATE_NO_VALID_EDGES = 0,
STATE_E0_E1_VALID,
STATE_E0_E2_VALID,
STATE_E1_E2_VALID,
STATE_ALL_EDGES_VALID,
STATE_VALID_TRI_EDGE_COUNT,
};
enum TriEdgesValues
{
NO_VALID_EDGES = 0,
E0_E1_VALID = 0x3,
E0_E2_VALID = 0x5,
E1_E2_VALID = 0x6,
ALL_EDGES_VALID = 0x7,
VALID_TRI_EDGE_COUNT,
};
// Selector for correct templated RasterizeTriangle function
PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
bool IsCenter,
bool IsConservative,
SWR_INPUT_COVERAGE InputCoverage,
uint32_t EdgeEnable,
bool RasterizeScissorEdges);
//////////////////////////////////////////////////////////////////////////
/// @brief ValidTriEdges convenience typedefs used for templated function
/// specialization
typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT;
typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT;
typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT;
typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT;
typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT;
typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT;
typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT;
typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT;
typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT;
typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT;
// some specializations to convert from edge state to edge bitmask values
template <typename EdgeMask>
struct EdgeMaskVal
{
static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID,
"Primary EdgeMaskVal shouldn't be instantiated");
};
template <>
struct EdgeMaskVal<StateAllEdgesValidT>
{
typedef AllEdgesValidT T;
};
template <>
struct EdgeMaskVal<StateE0E1ValidT>
{
typedef E0E1ValidT T;
};
template <>
struct EdgeMaskVal<StateE0E2ValidT>
{
typedef E0E2ValidT T;
};
template <>
struct EdgeMaskVal<StateE1E2ValidT>
{
typedef E1E2ValidT T;
};
template <>
struct EdgeMaskVal<StateNoEdgesValidT>
{
typedef NoEdgesValidT T;
};
INLINE uint32_t EdgeValToEdgeState(uint32_t val)
{
SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask");
static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = {0, 0, 0, 1, 0, 2, 3, 4};
return edgeValToEdgeState[val];
}
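// Worked example (illustrative): the edge bitmask E0_E2_VALID == 0x5 maps
// to edgeValToEdgeState[5] == 2 == STATE_E0_E2_VALID, compressing the
// sparse 3-bit mask into the dense enum used to size gRasterizerFuncs.
static_assert(E0_E2_VALID == 0x5, "edge bitmask layout assumed by the example above");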
//////////////////////////////////////////////////////////////////////////
/// @struct RasterEdgeTraits
/// @brief Primary RasterEdgeTraits templated struct that holds compile
/// time information about the number of edges that need to be rasterized.
/// If either the scissor rect or conservative rast is enabled,
/// the scissor test is enabled and the rasterizer will test
/// 3 triangle edges + 4 scissor edges for coverage.
/// @tparam RasterScissorEdgesT: do we need to rasterize scissor edges?
/// @tparam ConservativeT: is this a conservative rasterization?
/// @tparam EdgeMaskT: which edges are valid (not degenerate)?
template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT>
struct RasterEdgeTraits
{
typedef std::true_type RasterizeScissorEdgesT;
typedef std::integral_constant<uint32_t, 7> NumEdgesT;
// typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT;
typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT;
};
//////////////////////////////////////////////////////////////////////////
/// @brief specialization of RasterEdgeTraits. If neither scissor rect
/// nor conservative rast is enabled, only test 3 triangle edges
/// for coverage
template <typename EdgeMaskT>
struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT>
{
typedef std::false_type RasterizeScissorEdgesT;
typedef std::integral_constant<uint32_t, 3> NumEdgesT;
// no need for degenerate edge masking in non-conservative case; rasterize all triangle edges
typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT;
};
//////////////////////////////////////////////////////////////////////////
/// @struct RasterizerTraits
/// @brief templated struct that holds compile time information used
/// during rasterization. Inherits RasterEdgeTraits and
/// ConservativeRastBETraits.
/// @tparam NumSamplesT: number of multisamples
/// @tparam CenterPatternT: is a centered sample pattern in use?
/// @tparam ConservativeT: is this a conservative rasterization?
/// @tparam InputCoverageT: what type of input coverage is the PS expecting?
/// (only used with conservative rasterization)
/// @tparam EdgeEnableT: which edges are valid (not degenerate)?
/// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
template <typename NumSamplesT,
typename CenterPatternT,
typename ConservativeT,
typename InputCoverageT,
typename EdgeEnableT,
typename RasterScissorEdgesT>
struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
{
typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value),
CenterPatternT::value>
MT;
/// Fixed point precision the rasterizer is using
typedef FixedPointTraits<Fixed_16_8> PrecisionT;
/// Fixed point precision of the edge tests used during rasterization
typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT;
// If conservative rast or MSAA center pattern is enabled, only need a single sample coverage
// test, with the result copied to all samples
typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples>
NumCoverageSamplesT;
static_assert(
EdgePrecisionT::BitsT::value >=
ConservativeRastBETraits<ConservativeT,
InputCoverageT>::ConservativePrecisionT::BitsT::value,
"Rasterizer edge fixed point precision < required conservative rast precision");
/// constants used to offset between different types of raster tiles
static const int colorRasterTileStep{
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) *
MT::numSamples};
static const int depthRasterTileStep{
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) *
MT::numSamples};
static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
(FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) *
MT::numSamples};
static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
colorRasterTileStep};
static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
depthRasterTileStep};
static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
stencilRasterTileStep};
};
template <uint32_t NumSamplesT,
uint32_t CenterPatternT,
uint32_t ConservativeT,
uint32_t InputCoverageT,
uint32_t EdgeEnableT,
uint32_t RasterScissorEdgesT>
struct RasterizerTraits final
: public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>,
std::integral_constant<bool, CenterPatternT != 0>,
std::integral_constant<bool, ConservativeT != 0>,
std::integral_constant<uint32_t, InputCoverageT>,
std::integral_constant<uint32_t, EdgeEnableT>,
std::integral_constant<bool, RasterScissorEdgesT != 0>>
{
};
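// Instantiation sketch (illustrative): a 4x-MSAA, standard-pattern,
// non-conservative triangle with all edges valid and tile-aligned scissors
// resolves to
//   RasterizerTraits<SWR_MULTISAMPLE_4X, 0, 0, SWR_INPUT_COVERAGE_NONE,
//                    STATE_ALL_EDGES_VALID, 0>
// which yields NumEdgesT == 3 (no scissor edges) and
// MT::numCoverageSamples == 4, all folded at compile time into the
// generated rasterizer.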

File diff suppressed because it is too large


@ -1,94 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#include "rdtsc_core.h"
#include "common/rdtsc_buckets.h"
// must match CORE_BUCKETS enum order
BUCKET_DESC gCoreBuckets[] = {
{"APIClearRenderTarget", "", true, 0xff0b8bea},
{"APIDraw", "", true, 0xff000066},
{"APIDrawWakeAllThreads", "", false, 0xffffffff},
{"APIDrawIndexed", "", true, 0xff000066},
{"APIDispatch", "", true, 0xff660000},
{"APIStoreTiles", "", true, 0xff00ffff},
{"APIGetDrawContext", "", false, 0xffffffff},
{"APISync", "", true, 0xff6666ff},
{"APIWaitForIdle", "", true, 0xff0000ff},
{"FEProcessDraw", "", true, 0xff009900},
{"FEProcessDrawIndexed", "", true, 0xff009900},
{"FEFetchShader", "", false, 0xffffffff},
{"FEVertexShader", "", false, 0xffffffff},
{"FEHullShader", "", false, 0xffffffff},
{"FETessellation", "", false, 0xffffffff},
{"FEDomainShader", "", false, 0xffffffff},
{"FEGeometryShader", "", false, 0xffffffff},
{"FEStreamout", "", false, 0xffffffff},
{"FEPAAssemble", "", false, 0xffffffff},
{"FEBinPoints", "", false, 0xff29b854},
{"FEBinLines", "", false, 0xff29b854},
{"FEBinTriangles", "", false, 0xff29b854},
{"FETriangleSetup", "", false, 0xffffffff},
{"FEViewportCull", "", false, 0xffffffff},
{"FEGuardbandClip", "", false, 0xffffffff},
{"FEClipPoints", "", false, 0xffffffff},
{"FEClipLines", "", false, 0xffffffff},
{"FEClipTriangles", "", false, 0xffffffff},
{"FEClipRectangles", "", false, 0xffffffff},
{"FECullZeroAreaAndBackface", "", false, 0xffffffff},
{"FECullBetweenCenters", "", false, 0xffffffff},
{"FEEarlyRastEnter", "", false, 0xffffffff},
{"FEEarlyRastExit", "", false, 0xffffffff},
{"FEProcessStoreTiles", "", true, 0xff39c864},
{"FEProcessInvalidateTiles", "", true, 0xffffffff},
{"WorkerWorkOnFifoBE", "", false, 0xff40261c},
{"WorkerFoundWork", "", false, 0xff573326},
{"BELoadTiles", "", true, 0xffb0e2ff},
{"BEDispatch", "", true, 0xff00a2ff},
{"BEClear", "", true, 0xff00ccbb},
{"BERasterizeLine", "", true, 0xffb26a4e},
{"BERasterizeTriangle", "", true, 0xffb26a4e},
{"BETriangleSetup", "", false, 0xffffffff},
{"BEStepSetup", "", false, 0xffffffff},
{"BECullZeroArea", "", false, 0xffffffff},
{"BEEmptyTriangle", "", false, 0xffffffff},
{"BETrivialAccept", "", false, 0xffffffff},
{"BETrivialReject", "", false, 0xffffffff},
{"BERasterizePartial", "", false, 0xffffffff},
{"BEPixelBackend", "", false, 0xffffffff},
{"BESetup", "", false, 0xffffffff},
{"BEBarycentric", "", false, 0xffffffff},
{"BEEarlyDepthTest", "", false, 0xffffffff},
{"BEPixelShader", "", false, 0xffffffff},
{"BESingleSampleBackend", "", false, 0xffffffff},
{"BEPixelRateBackend", "", false, 0xffffffff},
{"BESampleRateBackend", "", false, 0xffffffff},
{"BENullBackend", "", false, 0xffffffff},
{"BELateDepthTest", "", false, 0xffffffff},
{"BEOutputMerger", "", false, 0xffffffff},
{"BEStoreTiles", "", true, 0xff00cccc},
{"BEEndTile", "", false, 0xffffffff},
};
static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])),
"RDTSC Bucket enum and description table size mismatched.");


@ -1,185 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "knobs.h"
#include "common/os.h"
#include "common/rdtsc_buckets.h"
#include <vector>
///////////////////////////////////////////////////////////////////////////////
// NOTE: This enum MUST be kept in sync with gCoreBuckets in rdtsc_core.cpp
///////////////////////////////////////////////////////////////////////////////
enum CORE_BUCKETS
{
APIClearRenderTarget,
APIDraw,
APIDrawWakeAllThreads,
APIDrawIndexed,
APIDispatch,
APIStoreTiles,
APIGetDrawContext,
APISync,
APIWaitForIdle,
FEProcessDraw,
FEProcessDrawIndexed,
FEFetchShader,
FEVertexShader,
FEHullShader,
FETessellation,
FEDomainShader,
FEGeometryShader,
FEStreamout,
FEPAAssemble,
FEBinPoints,
FEBinLines,
FEBinTriangles,
FETriangleSetup,
FEViewportCull,
FEGuardbandClip,
FEClipPoints,
FEClipLines,
FEClipTriangles,
FEClipRectangles,
FECullZeroAreaAndBackface,
FECullBetweenCenters,
FEEarlyRastEnter,
FEEarlyRastExit,
FEProcessStoreTiles,
FEProcessInvalidateTiles,
WorkerWorkOnFifoBE,
WorkerFoundWork,
BELoadTiles,
BEDispatch,
BEClear,
BERasterizeLine,
BERasterizeTriangle,
BETriangleSetup,
BEStepSetup,
BECullZeroArea,
BEEmptyTriangle,
BETrivialAccept,
BETrivialReject,
BERasterizePartial,
BEPixelBackend,
BESetup,
BEBarycentric,
BEEarlyDepthTest,
BEPixelShader,
BESingleSampleBackend,
BEPixelRateBackend,
BESampleRateBackend,
BENullBackend,
BELateDepthTest,
BEOutputMerger,
BEStoreTiles,
BEEndTile,
NumBuckets
};
void rdtscReset(BucketManager* pBucketMgr);
void rdtscInit(BucketManager* pBucketMgr, int threadId);
void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId);
void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId);
void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2);
void rdtscEndFrame(BucketManager* pBucketMgr);
#ifdef KNOB_ENABLE_RDTSC
#define RDTSC_RESET(pBucketMgr) rdtscReset(pBucketMgr)
#define RDTSC_INIT(pBucketMgr, threadId) rdtscInit(pBucketMgr,threadId)
#define RDTSC_START(pBucketMgr, bucket) rdtscStart(pBucketMgr, bucket)
#define RDTSC_STOP(pBucketMgr, bucket, count, draw) rdtscStop(pBucketMgr, bucket, count, draw)
#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2) rdtscEvent(pBucketMgr, bucket, count1, count2)
#define RDTSC_ENDFRAME(pBucketMgr) rdtscEndFrame(pBucketMgr)
#else
#define RDTSC_RESET(pBucketMgr)
#define RDTSC_INIT(pBucketMgr, threadId)
#define RDTSC_START(pBucketMgr, bucket)
#define RDTSC_STOP(pBucketMgr, bucket, count, draw)
#define RDTSC_EVENT(pBucketMgr, bucket, count1, count2)
#define RDTSC_ENDFRAME(pBucketMgr)
#endif
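// Usage sketch (illustrative): instrumented code brackets a region with the
// macros so non-RDTSC builds compile the calls away entirely, e.g.
//   RDTSC_START(pBucketMgr, BERasterizeTriangle);
//   ... rasterize ...
//   RDTSC_STOP(pBucketMgr, BERasterizeTriangle, 1, drawId);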
extern BUCKET_DESC gCoreBuckets[];
INLINE void rdtscReset(BucketManager *pBucketMgr)
{
pBucketMgr->mCurrentFrame = 0;
pBucketMgr->ClearThreads();
}
INLINE void rdtscInit(BucketManager* pBucketMgr, int threadId)
{
// register all the buckets once
if (!pBucketMgr->mBucketsInitialized && (threadId == 0))
{
pBucketMgr->mBucketMap.resize(NumBuckets);
for (uint32_t i = 0; i < NumBuckets; ++i)
{
pBucketMgr->mBucketMap[i] = pBucketMgr->RegisterBucket(gCoreBuckets[i]);
}
pBucketMgr->mBucketsInitialized = true;
}
std::string name = threadId == 0 ? "API" : "WORKER";
pBucketMgr->RegisterThread(name);
}
INLINE void rdtscStart(BucketManager* pBucketMgr, uint32_t bucketId)
{
uint32_t id = pBucketMgr->mBucketMap[bucketId];
pBucketMgr->StartBucket(id);
}
INLINE void rdtscStop(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count, uint64_t drawId)
{
uint32_t id = pBucketMgr->mBucketMap[bucketId];
pBucketMgr->StopBucket(id);
}
INLINE void rdtscEvent(BucketManager* pBucketMgr, uint32_t bucketId, uint32_t count1, uint32_t count2)
{
uint32_t id = pBucketMgr->mBucketMap[bucketId];
pBucketMgr->AddEvent(id, count1);
}
INLINE void rdtscEndFrame(BucketManager* pBucketMgr)
{
pBucketMgr->mCurrentFrame++;
if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_START_FRAME &&
KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
{
pBucketMgr->StartCapture();
}
if (pBucketMgr->mCurrentFrame == KNOB_BUCKETS_END_FRAME &&
KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
{
pBucketMgr->StopCapture();
pBucketMgr->PrintReport("rdtsc.txt");
}
}


@ -1,95 +0,0 @@
/****************************************************************************
* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
 * @file ringbuffer.h
*
* @brief RingBuffer
* The RingBuffer class manages all aspects of the ring buffer including
* the head/tail indices, etc.
*
******************************************************************************/
#pragma once
template <typename T>
class RingBuffer
{
public:
RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {}
~RingBuffer() { Destroy(); }
void Init(uint32_t numEntries)
{
SWR_ASSERT(numEntries > 0);
SWR_ASSERT(((1ULL << 32) % numEntries) == 0,
"%d is not evenly divisible into 2 ^ 32. Wrap errors will occur!",
numEntries);
mNumEntries = numEntries;
mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64);
SWR_ASSERT(mpRingBuffer != nullptr);
memset((void*)mpRingBuffer, 0, sizeof(T) * numEntries);
}
void Destroy()
{
AlignedFree(mpRingBuffer);
mpRingBuffer = nullptr;
}
T& operator[](const uint32_t index)
{
SWR_ASSERT(index < mNumEntries);
return mpRingBuffer[index];
}
INLINE void Enqueue()
{
mRingHead++; // There's only one producer.
// Assert to find wrap-around cases, NEVER ENABLE DURING CHECKIN!!
// SWR_REL_ASSERT(mRingHead);
}
INLINE void Dequeue()
{
InterlockedIncrement(&mRingTail); // There are multiple consumers.
}
INLINE bool IsEmpty() { return (GetHead() == GetTail()); }
INLINE bool IsFull()
{
uint32_t numEnqueued = GetHead() - GetTail();
SWR_ASSERT(numEnqueued <= mNumEntries);
return (numEnqueued == mNumEntries);
}
INLINE uint32_t GetTail() volatile { return mRingTail; }
INLINE uint32_t GetHead() volatile { return mRingHead; }
protected:
T* mpRingBuffer;
uint32_t mNumEntries;
    OSALIGNLINE(volatile uint32_t) mRingHead; // Producer counter
    OSALIGNLINE(volatile uint32_t) mRingTail; // Consumer counter
};
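// Usage sketch (illustrative, standalone; ExampleEnqueue is not part of the
// driver, and a 256-entry ring from Init(256) is assumed): the single
// producer owns the head slot, writes it, then publishes with Enqueue().
static inline void ExampleEnqueue(RingBuffer<uint32_t>& ring, uint32_t value)
{
    if (!ring.IsFull())
    {
        ring[ring.GetHead() % 256] = value; // indices wrap modulo the size
        ring.Enqueue();
    }
}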

File diff suppressed because it is too large


@ -1,67 +0,0 @@
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file state.h
*
* @brief Definitions for API state - complex function implementation.
*
******************************************************************************/
#pragma once
#include "core/state.h"
#include "common/simdintrin.h"
template <typename MaskT>
INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max)
{
__m128i vMin = _mm_set1_epi32(*min);
__m128i vMax = _mm_set1_epi32(*max);
return _simd_blend4_epi32<MaskT::value>(vMin, vMax);
}
INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples)
{
for (int i = 0; i < numSamples; i++)
{
_vXi[i] = _mm_set1_epi32(_xi[i]);
_vYi[i] = _mm_set1_epi32(_yi[i]);
_vX[i] = _simd_set1_ps(_x[i]);
_vY[i] = _simd_set1_ps(_y[i]);
}
// precalculate the raster tile BB for the rasterizer.
CalcTileSampleOffsets(numSamples);
}
INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples)
{
auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]);
auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]);
using xMask = std::integral_constant<int, 0xA>;
// BR(max), BL(min), UR(max), UL(min)
tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]);
auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]);
using yMask = std::integral_constant<int, 0xC>;
// BR(max), BL(min), UR(max), UL(min)
tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
};
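// Worked example (illustrative): xMask == 0xA (binary 1010) makes
// _simd_blend4_epi32 pick { min, max, min, max } across the four lanes and
// yMask == 0xC (binary 1100) picks { min, min, max, max }, so the lanes
// hold the UL(min,min), UR(max,min), BL(min,max), BR(max,max) corner
// offsets of the raster-tile bounding box.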

Some files were not shown because too many files have changed in this diff