swr/rast: Switch intrinsic usage to SIMDLib

Switch from a macro-based SIMD intrinsics layer to a more idiomatic C++
implementation, which also adds AVX512 optimizations for 128-bit
and 256-bit SIMD.

Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
This commit is contained in:
Tim Rowley 2017-06-15 15:24:07 -05:00
parent 8b66d18a3b
commit fc4f6c44c4
30 changed files with 6206 additions and 2663 deletions

View file

@ -65,6 +65,19 @@ COMMON_CXX_SOURCES := \
rasterizer/common/rdtsc_buckets_shared.h \
rasterizer/common/simd16intrin.h \
rasterizer/common/simdintrin.h \
rasterizer/common/simdlib.hpp \
rasterizer/common/simdlib_128_avx.inl \
rasterizer/common/simdlib_128_avx2.inl \
rasterizer/common/simdlib_128_avx512.inl \
rasterizer/common/simdlib_256_avx.inl \
rasterizer/common/simdlib_256_avx2.inl \
rasterizer/common/simdlib_256_avx512.inl \
rasterizer/common/simdlib_512_avx512.inl \
rasterizer/common/simdlib_512_avx512_masks.inl \
rasterizer/common/simdlib_512_emu.inl \
rasterizer/common/simdlib_512_emu_masks.inl \
rasterizer/common/simdlib_interface.hpp \
rasterizer/common/simdlib_types.hpp \
rasterizer/common/swr_assert.cpp \
rasterizer/common/swr_assert.h

View file

@ -26,89 +26,37 @@
#include "os.h"
#include <cassert>
#define SIMD_ARCH KNOB_ARCH
#include "simdlib_types.hpp"
#include <emmintrin.h>
#include <immintrin.h>
#include <xmmintrin.h>
// Fixed-width SIMD type aliases, sourced from the SIMDLib C++ implementation.
// simd4* = 128-bit (4 x 32-bit lanes)
typedef SIMDImpl::SIMD128Impl::Float simd4scalar;
typedef SIMDImpl::SIMD128Impl::Double simd4scalard;
typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector;
typedef SIMDImpl::SIMD128Impl::Mask simd4mask;
// simd8* = 256-bit (8 x 32-bit lanes)
typedef SIMDImpl::SIMD256Impl::Float simd8scalar;
typedef SIMDImpl::SIMD256Impl::Double simd8scalard;
typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector;
typedef SIMDImpl::SIMD256Impl::Mask simd8mask;
// simd16* = 512-bit (16 x 32-bit lanes)
typedef SIMDImpl::SIMD512Impl::Float simd16scalar;
typedef SIMDImpl::SIMD512Impl::Double simd16scalard;
typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector;
typedef SIMDImpl::SIMD512Impl::Mask simd16mask;
// Width-generic aliases selected by the build-time SIMD width knob.
#if KNOB_SIMD_WIDTH == 8
// NOTE(review): this is a unified-diff rendering without +/- markers; the raw
// __m256/__m256i/uint8_t typedefs below appear to be the pre-change
// definitions this commit removes, replaced by the simd8* aliases that
// follow — confirm against the actual patch.
typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
typedef simd8scalar simdscalar;
typedef simd8scalard simdscalard;
typedef simd8scalari simdscalari;
typedef simd8vector simdvector;
typedef simd8mask simdmask;
#else
#error Unsupported vector width
#endif
// simd vector
// SIMD-width-aligned 4-component vector; each component (x, y, z, w) is a
// full SIMD register. Components are accessible by name or by index through
// operator[], which aliases the same storage via the v[] array.
OSALIGNSIMD(union) simdvector
{
simdscalar v[4];
struct
{
simdscalar x, y, z, w;
};
simdscalar& operator[] (const int i) { return v[i]; }
const simdscalar& operator[] (const int i) const { return v[i]; }
};
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16
#if ENABLE_AVX512_EMULATION
// 16-wide types emulated as a pair of 8-wide AVX registers (lo/hi halves)
// for targets without native AVX512 support.
struct simd16scalar
{
__m256 lo;
__m256 hi;
};
struct simd16scalard
{
__m256d lo;
__m256d hi;
};
struct simd16scalari
{
__m256i lo;
__m256i hi;
};
// One bit per lane; split into two 8-bit halves by the macros below.
typedef uint16_t simd16mask;
#else
// Native AVX512: one 512-bit register (and hardware mask) per 16-wide type.
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16
// Extract / recombine the 8-wide halves of a 16-wide lane mask.
#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
#if defined(_WIN32)
// MSVC: use __vectorcall so wide vector arguments can be passed in registers.
#define SIMDAPI __vectorcall
#else
#define SIMDAPI
#endif
// 16-wide 4-component vector; same layout idea as simdvector above, aligned
// to the 16-wide register size.
OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
simd16scalar v[4];
struct
{
simd16scalar x, y, z, w;
};
simd16scalar& operator[] (const int i) { return v[i]; }
const simd16scalar& operator[] (const int i) const { return v[i]; }
};
#endif // ENABLE_AVX512_SIMD16
INLINE
UINT pdep_u32(UINT a, UINT mask)
{

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,550 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "simdlib_types.hpp"
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
// Per-width ISA implementation hierarchy. Each width (128/256/512) builds a
// chain of structs (AVX -> AVX2 -> AVX512) whose bodies are stamped out by
// textually including the matching .inl file under a guard macro; later ISAs
// inherit from earlier ones and override only what improves. A Traits struct
// then selects the implementation matching the compile-time SIMD_ARCH.
namespace SIMDImpl
{
namespace SIMD128Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Selects the 128-bit implementation and types for the compiled SIMD_ARCH.
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD128Impl::Float;
using Double = SIMD128Impl::Double;
using Integer = SIMD128Impl::Integer;
using Vec4 = SIMD128Impl::Vec4;
using Mask = SIMD128Impl::Mask;
};
} // ns SIMD128Impl
namespace SIMD256Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Selects the 256-bit implementation and types for the compiled SIMD_ARCH.
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD256Impl::Float;
using Double = SIMD256Impl::Double;
using Integer = SIMD256Impl::Integer;
using Vec4 = SIMD256Impl::Vec4;
using Mask = SIMD256Impl::Mask;
};
} // ns SIMD256Impl
namespace SIMD512Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
// Pre-AVX512 targets emulate 512-bit ops as a pair of 256-bit ops; the
// 256-bit implementation to pair is supplied as a template parameter.
template<typename SIMD256T>
struct AVXImplBase
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImplBase
using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
// Native 512-bit implementation; unlike 128/256 it does not inherit from
// the emulated base.
struct AVX512Impl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Selects the 512-bit implementation and types for the compiled SIMD_ARCH.
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD512Impl::Float;
using Double = SIMD512Impl::Double;
using Integer = SIMD512Impl::Integer;
using Vec4 = SIMD512Impl::Vec4;
using Mask = SIMD512Impl::Mask;
};
} // ns SIMD512Impl
} // ns SIMDImpl
//////////////////////////////////////////////////////////////////////////
/// SIMDBase - width-generic front end over a per-ISA implementation.
///
/// Inherits all primitive ops (add_ps, mul_ps, load1_ps, ...) from
/// Traits::IsaImpl and layers Vec4 / matrix helpers on top of them.
/// Instantiated as SIMD128 / SIMD256 / SIMD512 below.
//////////////////////////////////////////////////////////////////////////
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
    using CompareType = typename Traits::CompareType;
    using ScaleFactor = typename Traits::ScaleFactor;
    using RoundMode = typename Traits::RoundMode;
    using SIMD = typename Traits::IsaImpl;
    using Float = typename Traits::Float;
    using Double = typename Traits::Double;
    using Integer = typename Traits::Integer;
    using Vec4 = typename Traits::Vec4;
    using Mask = typename Traits::Mask;

private:
    // Dot product of one matrix row's first three columns with (v.x, v.y, v.z):
    //     (row[0] * v.x) + (row[1] * v.y) + (row[2] * v.z)
    // Rows are stored with a stride of 4 floats; callers pass pMatrix + row*4.
    // The multiply/add ordering matches the previous unrolled code exactly,
    // so float results are bit-identical.
    static SIMDINLINE Float SIMDCALL row_dot3(const float *pRow, const Vec4& v)
    {
        Float r = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        r = SIMD::add_ps(r, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        r = SIMD::add_ps(r, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        return r;
    }

public:
    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    static SIMDINLINE
    void vec4_load1_ps(Vec4& r, const float *p)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::set1_ps(p[i]);
        }
    }

    // Broadcasts the SIMD scalar register s into all four components of r.
    static SIMDINLINE
    void vec4_set1_vps(Vec4& r, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = s;
        }
    }

    // 3-component dot product: (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    static SIMDINLINE
    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
        tmp = SIMD::mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
        r = SIMD::add_ps(r, tmp);           // + (v0.y*v1.y)
        tmp = SIMD::mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
        r = SIMD::add_ps(r, tmp);           // + (v0.z*v1.z)
        return r;
    }

    // 4-component dot product:
    // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
    static SIMDINLINE
    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
        tmp = SIMD::mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
        r = SIMD::add_ps(r, tmp);           // + (v0.y*v1.y)
        tmp = SIMD::mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
        r = SIMD::add_ps(r, tmp);           // + (v0.z*v1.z)
        tmp = SIMD::mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
        r = SIMD::add_ps(r, tmp);           // + (v0.w*v1.w)  (comment fixed: previously omitted the w term)
        return r;
    }

    // Returns 1 / length(v), computed with the fast approximate
    // reciprocal-square-root of the ISA implementation.
    static SIMDINLINE
    Float vec4_rcp_length_ps(const Vec4& v)
    {
        Float length = vec4_dp4_ps(v, v);
        return SIMD::rsqrt_ps(length);
    }

    // r = v / length(v) (approximate, via rsqrt).
    static SIMDINLINE
    void vec4_normalize_ps(Vec4& r, const Vec4& v)
    {
        Float rcpLength = vec4_rcp_length_ps(v);
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::mul_ps(v[i], rcpLength);
        }
    }

    // r = v * s (component-wise scale by a SIMD scalar).
    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::mul_ps(v[i], s);
        }
    }

    // r = v0 * v1 (component-wise).
    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::mul_ps(v0[i], v1[i]);
        }
    }

    // r = v0 + s (component-wise add of a SIMD scalar).
    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::add_ps(v0[i], s);
        }
    }

    // r = v0 + v1 (component-wise).
    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::add_ps(v0[i], v1[i]);
        }
    }

    // r = min(v0, s) (component-wise).
    static SIMDINLINE
    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::min_ps(v0[i], s);
        }
    }

    // r = max(v0, s) (component-wise).
    static SIMDINLINE
    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::max_ps(v0[i], s);
        }
    }

    // Matrix4x4 * Vector4
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    // pMatrix is row-major, 4 floats per row.
    static SIMDINLINE
    void SIMDCALL mat4x4_vec4_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 4; ++row)
        {
            const float *pRow = pMatrix + row * 4;
            Float r = row_dot3(pRow, v);                            // (m0*v.x) + (m1*v.y) + (m2*v.z)
            Float m3 = SIMD::load1_ps(pRow + 3);                    // m[row][3]
            result[row] = SIMD::add_ps(r, SIMD::mul_ps(m3, v[3]));  // + (m3*v.w)
        }
    }

    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    //   outVec.xyz = 3x3 upper-left of the matrix times v.xyz; outVec.w = 0.
    //   The translation column (m*3) is ignored because w = 0.
    static SIMDINLINE
    void SIMDCALL mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 3; ++row)
        {
            result[row] = row_dot3(pMatrix + row * 4, v);
        }
        result[3] = SIMD::setzero_ps();
    }

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + m03
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + m13
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + m23
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + m33
    static SIMDINLINE
    void SIMDCALL mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 4; ++row)
        {
            const float *pRow = pMatrix + row * 4;
            Float r = row_dot3(pRow, v);                        // (m0*v.x) + (m1*v.y) + (m2*v.z)
            result[row] = SIMD::add_ps(r, SIMD::load1_ps(pRow + 3)); // + m3 (w == 1)
        }
    }

    // Matrix4x3 * Vector3 - Position vector where w = 1; the matrix has no
    // fourth row, so outVec.w is forced to 1.
    static SIMDINLINE
    void SIMDCALL mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 3; ++row)
        {
            const float *pRow = pMatrix + row * 4;
            Float r = row_dot3(pRow, v);                        // (m0*v.x) + (m1*v.y) + (m2*v.z)
            result[row] = SIMD::add_ps(r, SIMD::load1_ps(pRow + 3)); // + m3 (w == 1)
        }
        result[3] = SIMD::set1_ps(1.0f);
    }
}; // struct SIMDBase
// Public SIMD front-end types, one per register width.
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;

View file

@ -0,0 +1,545 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (1) implementation
//
// This file is textually included inside a struct body (see simdlib.hpp), so
// everything here becomes a static member function. The SIMD_*WRAPPER_*
// macros stamp out thin forwarders to the matching _mm_* intrinsic:
//   WRAPPER  = Float operands, IWRAPPER = Integer, DWRAPPER = Double;
//   trailing digit = operand count; "I" suffix = extra immediate template
//   parameter; trailing "_" = intrinsic name given explicitly.
//============================================================================
// Float (op)(Float)
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return _mm_##op(a);\
}
// Float (op)(Float, Float)
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm_##op(a, b);\
}
// Double (op)(Double, Double)
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm_##op(a, b);\
}
// Float (op)<ImmT>(Float, Float) - compile-time immediate operand
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm_##op(a, b, ImmT);\
}
// Double (op)<ImmT>(Double, Double)
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm_##op(a, b, ImmT);\
}
// Float (op)(Float, Float, Float)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm_##op(a, b, c);\
}
// Integer (op)(Integer)
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm_##op(a);\
}
// Integer (op)<ImmT>(Integer) - forwards to an explicitly named intrinsic
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
// Integer (op)(Integer, Integer) - forwards to an explicitly named intrinsic
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return intrin(a, b);\
}
// Integer (op)(Integer, Integer)
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm_##op(a, b);\
}
// Integer (op)(Integer, Integer) implemented with a Float intrinsic,
// bridged through bit-pattern casts.
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
}
// Integer (op)<ImmT>(Integer, Integer)
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm_##op(a, b, ImmT);\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// AVX(1) has no FMA instructions, so fused multiply-add/subtract are
// emulated with separate mul and add (one extra rounding step compared to a
// true fused operation). AVX2+ implementations override these.
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
// Round toward the mode selected by the compile-time RoundMode template
// argument (see simdlib_types.hpp for the enumerators).
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // widening multiply of the even (0, 2) int32 lanes -> two int64 results (see _mm_mul_epi32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations (bitwise; the _ps forms operate on float registers
// but treat the bits as opaque)
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//
// Immediate-count shifts map directly to intrinsics; per-lane variable
// counts (sllv/srlv) have no AVX(1) instruction and are emulated below.
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
// return a << b (uint32) -- per-lane variable left shift.
//
// AVX(1) has no vpsllvd, so emulate it lane by lane through memory.
// Shifts are performed on unsigned values (signed left shift of a negative
// value is undefined in C++), and counts greater than 31 zero the lane,
// matching the AVX2 _mm_sllv_epi32 semantics instead of invoking undefined
// shift behavior.
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB)
{
    alignas(16) uint32_t a[4];
    alignas(16) uint32_t count[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(a), vA);
    _mm_store_si128(reinterpret_cast<__m128i*>(count), vB);
    for (unsigned i = 0; i < 4; ++i)
    {
        a[i] = (count[i] > 31) ? 0 : (a[i] << count[i]);
    }
    return _mm_load_si128(reinterpret_cast<const __m128i*>(a));
}
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
// Whole-register byte shift on a Float register: bridge through the integer
// domain with bit-pattern casts.
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
// return a >> b (uint32) -- per-lane variable logical right shift.
//
// AVX(1) has no vpsrlvd, so emulate it lane by lane through memory.
// Bug fix: the previous emulation shifted int32_t values, which performs an
// arithmetic (sign-extending) shift on negative lanes, whereas the contract
// (and _mm_srlv_epi32) is a logical uint32 shift. Counts greater than 31
// zero the lane, matching the AVX2 semantics instead of invoking undefined
// shift behavior.
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB)
{
    alignas(16) uint32_t a[4];
    alignas(16) uint32_t count[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(a), vA);
    _mm_store_si128(reinterpret_cast<__m128i*>(count), vB);
    for (unsigned i = 0; i < 4; ++i)
    {
        a[i] = (count[i] > 31) ? 0 : (a[i] >> count[i]);
    }
    return _mm_load_si128(reinterpret_cast<const __m128i*>(a));
}
//-----------------------------------------------------------------------
// Conversion operations
//
// cast* functions reinterpret the bit pattern (no conversion instruction);
// cvt* functions perform an actual numeric conversion.
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm_castps_si128(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm_castsi128_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm_castps_pd(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm_castsi128_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm_cvtepi32_ps(a);
}
// Zero-extending widening conversions (operate on the low lanes of a).
SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
// Float -> int32: cvtps uses the current rounding mode, cvttps truncates.
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//
// Float comparisons produce an all-ones / all-zeros mask per lane; the
// predicate is a compile-time CompareType (AVX _mm_cmp_ps predicate).
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
// Named shorthands using the ordered, non-signaling (OQ) predicates.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm_testz_si128(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
// Integer blend implemented via the float blend after bit-casting;
// selection uses the sign bit of each 32-bit mask lane.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
return _mm_broadcast_ss(p);
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
// Integer permute implemented with the float permutevar after bit-casting
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int)
{
return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm_permutevar_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
// Explicitly deleted for this target; callers must use an alternative form
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Scalar emulation of a gather (no gather instruction pre-AVX2): for each
// lane, scale its 32-bit index by ScaleT bytes and fetch the float at that
// byte offset from the base pointer p.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    Float gathered;
    float* dst = reinterpret_cast<float*>(&gathered);
    uint32_t const* indices = reinterpret_cast<uint32_t const*>(&idx);
    uint8_t const* base = reinterpret_cast<uint8_t const*>(p);
    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
    {
        uint32_t byteOffset = indices[lane] * static_cast<uint32_t>(ScaleT);
        dst[lane] = *reinterpret_cast<float const*>(base + byteOffset);
    }
    return gathered;
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
// Aligned load; p must meet the vector alignment requirement
return _mm_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return _mm_load_si128(&p->v);
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
// lddqu variant of the unaligned integer load
return _mm_lddqu_si128(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// Scalar emulation of a masked gather: only lanes whose mask sign bit is set
// are fetched; all other lanes keep their value from 'old'.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    Float gathered = old;
    float* dst = reinterpret_cast<float*>(&gathered);
    uint32_t const* indices = reinterpret_cast<uint32_t const*>(&idx);
    uint8_t const* base = reinterpret_cast<uint8_t const*>(p);
    uint32_t activeLanes = movemask_ps(mask);
    DWORD lane;
    // Visit each set bit (active lane) of the movemask, clearing as we go
    while (_BitScanForward(&lane, activeLanes))
    {
        activeLanes &= ~(1 << lane);
        uint32_t byteOffset = indices[lane] * static_cast<uint32_t>(ScaleT);
        dst[lane] = *reinterpret_cast<float const*>(base + byteOffset);
    }
    return gathered;
}
// Store src lanes whose corresponding mask lane has the sign bit set
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
_mm_maskstore_ps(p, mask, src);
}
// Pack the most-significant bit of each 8-bit lane into an integer bitmask
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return static_cast<uint32_t>(_mm_movemask_epi8(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
return static_cast<uint32_t>(_mm_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
return static_cast<uint32_t>(_mm_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm_setzero_si128();
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
// Aligned store; p must meet the vector alignment requirement
_mm_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm_store_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm_storeu_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm_stream_ps(p, a);
}
static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
return _mm_set_ps(in3, in2, in1, in0);
}
// Extract lane ImmT as a float; the intrinsic returns the raw bits as int,
// so the bit pattern is punned back to float unchanged.
template <int ImmT>
static SIMDINLINE float SIMDCALL extract_ps(Float a)
{
int tmp = _mm_extract_ps(a, ImmT);
return *reinterpret_cast<float*>(&tmp);
}
// Clean up the wrapper helper macros so the next SIMD implementation
// header can redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I

View file

@ -0,0 +1,68 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD4 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are those that replace AVX (1) operations.
// Only 2 shifts and 2 gathers were introduced with AVX 2
// Also, add native support for FMA operations
//============================================================================
// 3-operand float wrapper (used for the native FMA forms below)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm_##op(a, b, c);\
}
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
// Native AVX2 per-lane variable shifts (replace the AVX (1) emulations)
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
return _mm_sllv_epi32(vA, vB);
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
return _mm_srlv_epi32(vA, vB);
}
// Native AVX2 hardware gather (replaces the AVX (1) scalar-loop emulation)
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
}
#undef SIMD_WRAPPER_3

View file

@ -0,0 +1,408 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are those that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
// Widen/narrow helpers: zero-extend a 128-bit register into a 512-bit one
// (and back) so the maskz forms of the 512-bit intrinsics can operate on
// just the low 4 lanes via the 0xf lane masks used throughout this file.
private:
static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps128_ps512(r.v); }
static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }
static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps128(r); }
static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
public:
// Float wrappers: apply the 512-bit masked-zero intrinsic to the widened
// operands, keeping only the lanes selected by 'mask' (low 4 x 32-bit).
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
// Double wrappers (2 x 64-bit lanes -> 0x3 masks). The unparameterized
// forms are unavailable under AVX512F_STRICT.
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
}
// Integer wrappers; the _8/_16/_32/_64 suffix is the element width, which
// fixes the lane-mask width (16/8/4/2 active lanes in 128 bits).
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
// Bug fix: rcp28_ps / rsqrt28_ps are AVX512ER instructions (Xeon Phi only)
// and fault with illegal-instruction on AVX512F parts; this file otherwise
// guards non-AVX512F ops with AVX512F_STRICT, so use the base-AVX512F
// rcp14_ps / rsqrt14_ps approximations instead. Their 2^-14 relative error
// is still tighter than the ~1.5*2^-12 of the _mm256_rcp_ps/_mm256_rsqrt_ps
// forms used by the AVX implementations of these same operations.
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
// Byte-element ops; excluded under AVX512F_STRICT (presumably because they
// need more than base AVX512F -- confirm against the target ISA)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
#endif
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
#endif
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Bitwise ops are expressed with the epi32 intrinsics and a 4-lane mask
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
// return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// Pack/unpack operate on sub-dword elements, hence the AVX512F_STRICT guard
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
#endif
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
// return _mm256_permutevar8x32_ps(a, swiz);
//}
SIMD_IWRAPPER_1I_32(shuffle_epi32);
//template<int ImmT>
//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
#endif
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// All loads use the masked-zero 512-bit forms restricted to the low 4
// lanes, so only 16 bytes are touched in memory.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_load_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return __conv(_mm512_maskz_load_epi32(__mmask16(0xf), p));
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}
// Hardware gather of 4 floats, masked to the low 4 lanes
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(),
__mmask16(0xf),
__conv(idx),
p,
static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // Convert the float blend mask into a lane predicate by testing the
    // sign bit of each 32-bit lane, restricted to the low 4 lanes.
    // Bug fix: the test constant was 0x8000000 (bit 27, an exponent bit),
    // not the sign bit 0x80000000 the contract above requires and which
    // the matching maskstore_ps implementation already uses.
    __mmask16 m = 0xf;
    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
                                    _mm512_set1_epi32(0x80000000));
    // Inactive lanes keep their value from 'old'
    return __conv(_mm512_mask_i32gather_ps(
        __conv(old),
        m,
        __conv(idx),
        p,
        static_cast<int>(ScaleT)));
}
#if !defined(AVX512F_STRICT)
// Emulate SSE movemask: collect the top bit of each of the 16 bytes
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffull;
return static_cast<uint32_t>(
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#endif
// Store only lanes whose mask lane has the sign bit (bit 31) set
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
__mmask16 m = 0xf;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_store_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_store_ps(p, __mmask16(0xf), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm512_mask_store_epi32(p, __mmask16(0xf), __conv(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Clean up the wrapper helper macros so the next SIMD implementation
// header can redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -0,0 +1,757 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// 128-bit implementation used to emulate integer ops two halves at a time
// (AVX (1) has no 256-bit integer instructions)
using SIMD128T = SIMD128Impl::AVXImpl;
//============================================================================
// SIMD256 AVX (1) implementation
//============================================================================
// Direct wrappers around the corresponding _mm256_* intrinsic
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return _mm256_##op(a);\
}
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm256_##op(a, b, c);\
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm256_##op(a);\
}
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm256_##op(a, b);\
}
// Integer op implemented with a float intrinsic via bit-casts
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
}
#define SIMD_IFWRAPPER_2I(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm256_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
{\
return _mm256_##op(a, b, c);\
}
// emulated integer simd: each wrapper applies the 128-bit implementation
// independently to the two 128-bit halves (.v4[0] / .v4[1]) of the
// 256-bit operand(s).
#define SIMD_EMU_IWRAPPER_1(op) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a)\
{\
return Integer\
{\
SIMD128T::op(a.v4[0]),\
SIMD128T::op(a.v4[1]),\
};\
}
// Widening conversion: the upper result half is produced by first shifting
// the source right by 'shift' bytes so its input elements land in the low
// half of the 128-bit register.
#define SIMD_EMU_IWRAPPER_1L(op, shift) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a)\
{\
return Integer \
{\
SIMD128T::op(a.v4[0]), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
};\
}\
static SIMDINLINE \
Integer SIMDCALL op(SIMD128Impl::Integer a)\
{\
return Integer \
{\
SIMD128T::op(a), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
};\
}
#define SIMD_EMU_IWRAPPER_1I(op) \
template <int ImmT> static SIMDINLINE \
Integer SIMDCALL op(Integer a)\
{\
return Integer\
{\
SIMD128T::template op<ImmT>(a.v4[0]),\
SIMD128T::template op<ImmT>(a.v4[1]),\
};\
}
#define SIMD_EMU_IWRAPPER_2(op) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a, Integer b)\
{\
return Integer\
{\
SIMD128T::op(a.v4[0], b.v4[0]),\
SIMD128T::op(a.v4[1], b.v4[1]),\
};\
}
// Emulated 2-source integer op with an immediate template parameter:
// applies the 128-bit implementation to each 128-bit half.
// Bug fix: the second operand previously read b.v[0] / b.v[1] (indexing the
// native 256-bit member) instead of the 128-bit halves b.v4[0] / b.v4[1]
// used for operand 'a'.
#define SIMD_EMU_IWRAPPER_2I(op) \
    template <int ImmT> static SIMDINLINE \
    Integer SIMDCALL op(Integer a, Integer b)\
    {\
        return Integer\
        {\
            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),\
            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),\
        };\
    }
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
    // AVX (1) has no FMA instructions; emulate with separate mul + add
    return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
    // Bug fix: SIMD_WRAPPER_3(fmsub_ps) expanded to _mm256_fmsub_ps, an
    // FMA3 instruction that faults on AVX (1)-only hardware. Emulate with
    // separate mul + sub, mirroring fmadd_ps above.
    return sub_ps(mul_ps(a, b), c);
}
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// Round with a compile-time rounding mode (suppressing exceptions)
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
// All emulated via two 128-bit ops (AVX (1) lacks 256-bit integer math)
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Float-typed bitwise ops are native; integer variants are emulated
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b (uint32)
{
    // AVX (1) has no vpsllvd (per-lane variable shift), so emulate in scalar
    // code. Lanes are processed as uint32_t: a left shift produces the same
    // bit pattern as for int32_t, but unsigned arithmetic avoids the
    // signed-overflow UB the previous int32_t version hit whenever a bit was
    // shifted into or past the sign bit.
    // NOTE(review): shift counts >= 32 remain undefined behavior here,
    // whereas AVX2 vpsllvd yields 0 -- confirm callers never pass such counts.
    uint32_t vals[8];
    uint32_t counts[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(vals), vA);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);
    for (uint32_t lane = 0; lane < 8; ++lane)
    {
        vals[lane] <<= counts[lane];
    }
    return _mm256_loadu_si256(reinterpret_cast<__m256i const*>(vals));
}
SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
// Reinterpret to integer, byte-shift right, reinterpret back to float.
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b (uint32)
{
    // AVX (1) has no vpsrlvd (per-lane variable shift), so emulate in scalar
    // code. Fix: process lanes as uint32_t so the shift is a logical
    // (zero-filling) right shift, as the uint32 contract requires. The
    // previous emulation shifted int32_t lanes, which is an arithmetic
    // (sign-extending) shift for negative values and does not match AVX2's
    // vpsrlvd.
    // NOTE(review): shift counts >= 32 remain undefined behavior here,
    // whereas AVX2 vpsrlvd yields 0 -- confirm callers never pass such counts.
    uint32_t vals[8];
    uint32_t counts[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(vals), vA);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);
    for (uint32_t lane = 0; lane < 8; ++lane)
    {
        vals[lane] >>= counts[lane];
    }
    return _mm256_loadu_si256(reinterpret_cast<__m256i const*>(vals));
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
// The cast* functions only reinterpret bits between vector element types;
// no conversion is performed and no instruction should be generated.
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm256_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm256_castps_si256(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm256_castsi256_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm256_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
return _mm256_castpd_si256(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm256_castsi256_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm256_cvtepi32_ps(a);
}
// Widening conversions; the second macro argument is presumably the source
// lane count consumed from the low half of the input (macro defined
// earlier in the file -- confirm there).
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16)
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
// Rounds according to the current MXCSR rounding mode.
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm256_cvtps_epi32(a);
}
// Truncating (round-toward-zero) conversion.
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm256_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Generic float compare; CmpTypeT selects the vcmpps predicate.
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
// All shorthands below use the _OQ (ordered, quiet) predicates: any
// comparison involving NaN produces false and no exception is signaled.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
// Integer compares return all-ones per lane on true, zero on false.
SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm256_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm256_testz_si256(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
// Integer blend emulated through the float blend instruction (bitwise
// select, so reinterpreting lanes as float is safe).
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
// Selection is per-lane based on the mask's sign bit (blendv semantics).
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
return _mm256_broadcast_ss(p);
}
SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
    Integer result;

    // Scalar emulation: AVX (1) has no cross-lane 32-bit permute.
    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // Mask the index with 0x7, not 0xF: this vector has 8 lanes, so the
        // old 0xF mask permitted out-of-bounds reads (pA[8..15]) for
        // malformed swiz values. Low-3-bit masking also matches the AVX2
        // hardware behavior of vpermd/_mm256_permutevar8x32_epi32, which
        // honors only the low 3 bits of each index.
        pResult[i] = pA[0x7 & pSwiz[i]];
    }
    return result;
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    Float result;

    // Scalar emulation: AVX (1) has no cross-lane 32-bit permute.
    float const *pA = reinterpret_cast<float const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    float *pResult = reinterpret_cast<float *>(&result);
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // Mask the index with 0x7, not 0xF: this vector has 8 lanes, so the
        // old 0xF mask permitted out-of-bounds reads (pA[8..15]) for
        // malformed swiz values. Low-3-bit masking also matches the AVX2
        // hardware behavior of _mm256_permutevar8x32_ps, which honors only
        // the low 3 bits of each index.
        pResult[i] = pA[0x7 & pSwiz[i]];
    }
    return result;
}
SIMD_WRAPPER_2I(permute2f128_ps); // select 128-bit halves of a/b per ImmT
SIMD_DWRAPPER_2I(permute2f128_pd);
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
// 64-bit integer shuffle emulated via the double-precision shuffle
// (bitwise lane move, so the reinterpret casts are safe).
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
// Unpack (interleave) ops: 32-bit integer forms reuse the float
// instructions; other widths are emulated on AVX (1).
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Scalar gather emulation: AVX (1) has no gather instruction. idx holds
// per-lane byte offsets which are scaled by ScaleT before dereferencing.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
// Aligned load: p must be 32-byte aligned per _mm256_load_ps.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return _mm256_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return _mm256_load_si256(&p->v);
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm256_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm256_lddqu_si256(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// Scalar emulation of the masked gather: only lanes whose mask sign bit is
// set are fetched; all other lanes keep their value from 'old'.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
DWORD index;
// movemask_ps packs the per-lane sign bits into a scalar bitmask; iterate
// its set bits with _BitScanForward (MSVC-style intrinsic; presumably
// shimmed for other compilers in os.h -- confirm there).
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{
umask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[index] = *(float const *)(((uint8_t const *)p + offset));
}
return vResult;
}
// Store only the lanes whose mask element has its sign bit set.
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
_mm256_maskstore_ps(p, mask, src);
}
// No 256-bit integer movemask on AVX (1): combine the two 128-bit halves.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return SIMD128T::movemask_epi8(a.v4[0]) |
(SIMD128T::movemask_epi8(a.v4[1]) << 16);
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
return static_cast<uint32_t>(_mm256_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm256_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm256_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm256_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm256_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm256_setzero_si256();
}
// Aligned store: p must be 32-byte aligned per _mm256_store_ps.
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm256_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm256_store_si256(&p->v, a);
}
// Non-temporal (streaming) store; bypasses the cache hierarchy.
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm256_stream_ps(p, a);
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Broadcast a 128-bit vector into both halves of a 256-bit vector.
static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
{
return _mm256_broadcast_ps(&p->v);
}
// extractf128_*: extract the low (ImmT=0) or high (ImmT=1) 128-bit half.
template<int ImmT>
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
{
return _mm256_extractf128_pd(a, ImmT);
}
template<int ImmT>
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float a)
{
return _mm256_extractf128_ps(a, ImmT);
}
template<int ImmT>
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
{
return _mm256_extractf128_si256(a, ImmT);
}
// insertf128_*: replace the low (ImmT=0) or high (ImmT=1) 128-bit half.
template<int ImmT>
static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
{
return _mm256_insertf128_pd(a, b, ImmT);
}
template<int ImmT>
static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
{
return _mm256_insertf128_ps(a, b, ImmT);
}
template<int ImmT>
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
{
return _mm256_insertf128_si256(a, b, ImmT);
}
// Fallback definitions for toolchains whose headers do not provide these
// composite intrinsics (guarded so native definitions win when present).
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif
#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
/* SIMD128Impl::Integer const* */ loaddr) \
_mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
// Load two independent (possibly unaligned) 128-bit halves into one vector.
static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
{
return _mm256_loadu2_m128i(&phi->v, &plo->v);
}
// Element order: i0 is the lowest lane, i7 the highest.
static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}
// Store the two 128-bit halves of src to two independent addresses.
static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
{
_mm256_storeu2_m128i(&phi->v, &plo->v, src);
}
// Tear down the local wrapper macros so the next implementation file can
// redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_3
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I

View file

@ -0,0 +1,234 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below ones that replace AVX (1) operations.
// Mostly these are integer operations that are no longer emulated with SSE
//============================================================================
// Wrapper macros mapping SIMDLib integer ops onto native AVX2 intrinsics.
// The *_ suffix variants take an explicit intrinsic name; *I variants take
// a compile-time immediate.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a);\
    }
// NOTE: no line-continuation backslash after the closing brace here -- the
// stray trailing '\' previously spliced the next #define line into this
// macro's replacement list.
#define SIMD_IWRAPPER_1L(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(_mm256_castsi256_si128(a));\
    }
#define SIMD_IWRAPPER_1I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b);\
    }
// Defined once; the file previously contained a second, identical
// definition of SIMD_IWRAPPER_2I, which has been removed.
#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b, ImmT);\
    }
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
// Native fused multiply-add (single rounding); overrides the AVX (1)
// mul+add emulation.
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
return _mm256_fmadd_ps(a, b, c);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
// These use the native 256-bit AVX2 integer instructions, replacing the
// 128-bit-half emulation in the AVX (1) implementation.
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
// Reinterpret to integer, byte-shift right, reinterpret back to float.
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
// Widening conversions consume the low 128 bits of the input (see the
// SIMD_IWRAPPER_1L macro, which inserts the castsi256_si128).
SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
// AVX2 has no less-than compare instruction; swap operands of cmpgt.
static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
{
return cmpgt_epi32(b, a);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Native cross-lane permutes (vpermd/vpermps) replace the AVX (1) scalar
// emulation; hardware honors only the low 3 bits of each index.
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm256_permutevar8x32_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
// 64-bit integer shuffle emulated via the double-precision shuffle
// (bitwise lane move, so the reinterpret casts are safe).
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Native hardware gather replaces the AVX (1) scalar emulation.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
// Native 256-bit integer movemask replaces the two-half AVX (1) version.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return static_cast<uint32_t>(_mm256_movemask_epi8(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Tear down the local wrapper macros so the next implementation file can
// redefine them. (A duplicate "#undef SIMD_IWRAPPER_2I" has been removed.)
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1L
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I

View file

@ -0,0 +1,409 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
private:
// Zero-cost cast helpers between the 256-bit SIMD wrapper types and the
// native 512-bit register types. The AVX512 implementation below runs each
// 256-bit operation on a 512-bit register under a lane mask, so operands
// round-trip through these casts (the upper 256 bits are don't-care).
static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps256_ps512(r.v); }
static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); }
static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }
static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps256(r); }
static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd256(r); }
static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }
public:
// Wrapper macros executing each op as a masked-zero 512-bit instruction on
// widened operands. The mask selects the live lanes of the 256-bit value:
//   __mmask16(0xff)         = 8 of 16 float/int32 lanes
//   __mmask8(0xf)           = 4 of 8 double/int64 lanes
//   __mmask32(0xffff)       = 16 of 32 int16 lanes
//   __mmask64(0xffffffffull)= 32 of 64 int8 lanes
// Wrappers needing the 8/16/64-bit-lane mask types are gated on
// !AVX512F_STRICT -- presumably because those forms require more than the
// AVX512F foundation ISA (e.g. AVX512BW/DQ) -- confirm against target
// configuration.
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
//SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xff)); // return 1.0f / a
//SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
#endif
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
#endif
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Bitwise ops are expressed through the masked 32-bit-element forms so a
// zero-masking execution mask can restrict them to the low 256 bits.
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions
//-----------------------------------------------------------------------
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
//    return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
#endif
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}
SIMD_IWRAPPER_1I_32(shuffle_epi32);
//template<int ImmT>
//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
#endif
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Masked loads read only the low 8 lanes; 'load' variants require aligned
// memory, 'loadu' variants do not.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_load_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return __conv(_mm512_maskz_load_epi32(__mmask16(0xff), p));
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}
// Gather 8 floats; idx holds per-lane byte offsets scaled by ScaleT.
// Lanes outside the execution mask (upper 8) are zeroed via the merge source.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(),
__mmask16(0xff),
__conv(idx),
p,
static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // Start from the 8-lane execution mask (0xff) so the upper half of the
    // 512-bit register is never gathered, then keep only lanes whose mask
    // value has the sign bit set; all other lanes retain 'old'.
    __mmask16 m = 0xff;
    // Bug fix: the sign-bit constant was 0x8000000 (bit 27); it must be
    // 0x80000000 (bit 31) to match the documented (mask & (1 << 31))
    // semantics and the constant used by maskstore_ps below.
    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
                                    _mm512_set1_epi32(0x80000000));
    return __conv(_mm512_mask_i32gather_ps(
        __conv(old),
        m,
        __conv(idx),
        p,
        static_cast<int>(ScaleT)));
}
#if !defined(AVX512F_STRICT)
// Returns a 32-bit mask with one bit per byte lane, set when the byte's
// sign bit (0x80) is set; only the low 32 bytes are tested (mask ULL).
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffffffull;
return static_cast<uint32_t>(
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#endif
// Store lanes whose 32-bit mask element has the sign bit (0x80000000) set.
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
__mmask16 m = 0xff;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_store_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_store_ps(p, __mmask16(0xff), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm512_mask_store_epi32(p, __mmask16(0xff), __conv(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Tear down all generator macros so the next included implementation file
// can redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -0,0 +1,682 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation
//
// TODO: Optimize for KNL / KNH or for SKX??
// For now probably optimizing more for KNL as that's where
// immediate customers are.
//============================================================================
// Native lane count for this target: 16 x 32-bit lanes in a 512-bit register.
static const int TARGET_SIMD_WIDTH = 16;
// Half-width (256-bit) operations delegate to the AVX2 implementation.
using SIMD256T = SIMD256Impl::AVX2Impl;
// Wrapper-generator macros: each operation maps 1:1 onto a full-width
// _mm512_* intrinsic (no execution mask needed at native 512-bit width).
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return intrin(a);\
}
#define SIMD_WRAPPER_1(op) \
SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
// Float op implemented via an integer intrinsic (bit-preserving casts).
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_castsi512_ps(_mm512_##intrin(\
_mm512_castps_si512(a), _mm512_castps_si512(b)));\
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##op(a, b);\
}
#define SIMD_WRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm512_##op(a, b, c);\
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm512_##op(a);\
}
// Widening conversions: input is a 256-bit (_8) or 128-bit (_4) vector.
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
{\
return _mm512_##op(a);\
}
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
{\
return _mm512_##op(a);\
}
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return cmp(a, b);\
}
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
// Expand a hardware k-mask into a legacy all-ones / all-zeros vector mask;
// one overload per element width (64/32/16/8-bit lanes).
static SIMDINLINE Integer vmask(__mmask8 m)
{
return _mm512_maskz_set1_epi64(m, -1LL);
}
static SIMDINLINE Integer vmask(__mmask16 m)
{
return _mm512_maskz_set1_epi32(m, -1);
}
static SIMDINLINE Integer vmask(__mmask32 m)
{
return _mm512_maskz_set1_epi16(m, -1);
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
return _mm512_maskz_set1_epi8(m, -1);
}
public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
// NOTE(review): rcp28/rsqrt28 are AVX512ER instructions (Xeon Phi / KNL
// only, per the header comment above) -- confirm they are not reached on
// SKX-class AVX512F/BW/DQ targets.
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp28_ps); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt28_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// Round using the immediate rounding mode carried by the RoundMode enum.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
// Strict AVX512F has no 512-bit float bitwise ops (they are AVX512DQ), so
// fall back to the integer forms with bit-preserving casts.
#if defined(AVX512F_STRICT)
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
#else
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
#endif
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
SIMD_IWRAPPER_2(srlv_epi32);
SIMD_IWRAPPER_2(srlv_epi32);
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm512_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm512_castps_si512(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm512_castsi512_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm512_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
return _mm512_castpd_si512(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm512_castsi512_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm512_cvtepi32_ps(a);
}
SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm512_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm512_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
// Legacy vector mask generator
__mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
return castsi_ps(vmask(result));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// ImmT is interpreted as a per-lane bitmask selecting b over a.
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}
// NOTE(review): declared to return Float, but _mm512_mask_blend_epi32
// produces a __m512i; this relies on the wrapper-type conversion -- confirm
// the Float return type is intended rather than Integer.
template <int ImmT>
static SIMDINLINE Float blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}
// Per-lane select driven by the sign bit of mask (via movemask_ps).
static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
return _mm512_set1_ps(*p);
}
// 256-bit half extract/insert are routed through the f64x4/i64x4 forms,
// which exist in base AVX512F (the f32x8 forms require AVX512DQ).
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}
template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
return _mm512_extractf64x4_pd(a, imm);
}
template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
return _mm512_extracti64x4_epi64(a, imm);
}
template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}
template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
return _mm512_insertf64x4(a, b, imm);
}
template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
return _mm512_inserti64x4(a, b, imm);
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm512_permutexvar_epi32(swiz, a);
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm512_permutexvar_ps(swiz, a);
}
// Legacy 128-bit-lane permutes map to the AVX512 shuffle_*x4/x2 forms.
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
SIMD_IWRAPPER_1I(shuffle_epi32);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi16);
//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Scalar-emulated gather (no hardware gather used here).
// NOTE(review): reads lane offsets by reinterpreting &idx as uint32_t* --
// assumes Integer is addressable as 16 contiguous uint32 lanes; confirm.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
// 'load' variants require 64-byte alignment; 'loadu' variants do not.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return _mm512_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return _mm512_load_si512(&p->v);
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm512_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm512_loadu_si512(p);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // Recover the lane predicate from the legacy full-width float mask.
    // NOTE(review): this tests "mask != 0.0f" rather than the sign bit the
    // comment above describes; legacy masks are all-ones/all-zeros so the
    // two agree in practice -- confirm no caller passes arbitrary floats.
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());
    // Consistency fix: pass the scale as an explicit int, matching
    // i32gather_ps above (required if ScaleFactor is a scoped enum).
    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}
// Store lanes whose 32-bit mask element is negative (sign bit set).
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
_mm512_mask_store_ps(p, m, src);
}
// One bit per byte lane: set when the int8 lane is negative (sign bit set).
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
return static_cast<uint64_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
__mmask8 m = _mm512_cmplt_pd_mask(a, setzero_pd());
return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
__mmask16 m = _mm512_cmplt_ps_mask(a, setzero_ps());
return static_cast<uint32_t>(m);
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm512_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm512_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm512_set1_ps(f);
}
static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
return _mm512_setzero_pd();
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm512_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm512_setzero_si512();
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm512_store_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm512_storeu_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm512_stream_ps(p, a);
}
static SIMDINLINE Integer SIMDCALL set_epi32(
int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return _mm512_set_epi32(
i15, i14, i13, i12, i11, i10, i9, i8,
i7, i6, i5, i4, i3, i2, i1, i0);
}
// 8-argument overload: upper eight lanes are zero-filled.
static SIMDINLINE Integer SIMDCALL set_epi32(
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(
float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return _mm512_set_ps(
i15, i14, i13, i12, i11, i10, i9, i8,
i7, i6, i5, i4, i3, i2, i1, i0);
}
// 8-argument overload: upper eight lanes are zero-filled.
static SIMDINLINE Float SIMDCALL set_ps(
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
// Expand a scalar bitmask into a legacy all-ones/all-zeros float vector mask.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}
// Clean up helper macros so they do not leak into files included later.
// The original list #undef'd SIMD_IWRAPPER_1, SIMD_IWRAPPER_2 and
// SIMD_IWRAPPER_2I twice each (copy/paste duplicates); each name is now
// listed exactly once. (#undef of an already-undefined name is harmless,
// so removing the duplicates is behavior-preserving.)
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
#undef SIMD_IFWRAPPER_2

View file

@ -0,0 +1,27 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions

View file

@ -0,0 +1,842 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX (1) implementation
//
// Emulates 512-bit (16-wide) SIMD by pairing two 256-bit (8-wide) operations
// on the underlying SIMD256T target; v8[0] holds the low 8 lanes, v8[1] the
// high 8 lanes.
//============================================================================
static const int TARGET_SIMD_WIDTH = 8; // lane count of each 256-bit half
using SIMD128T = SIMD128Impl::AVXImpl; // 128-bit helper used by some conversions
// Wrapper generators: each expands to a function that applies the SIMD256T
// operation of the same name to both 256-bit halves independently.
// Float unary op.
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return Float\
{\
SIMD256T::op(a.v8[0]),\
SIMD256T::op(a.v8[1]),\
};\
}
// Float binary op.
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return Float\
{\
SIMD256T::op(a.v8[0], b.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1]),\
};\
}
// Float binary op with a 16-bit per-lane immediate: low 8 bits go to the low
// half, the bits shifted down by TARGET_SIMD_WIDTH (8) go to the high half.
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return Float\
{\
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
};\
}
// Float binary op with one immediate applied identically to both halves.
#define SIMD_WRAPPER_2I_1(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return Float\
{\
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
};\
}
// Float ternary op.
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return Float\
{\
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
};\
}
// Integer unary op.
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return Integer\
{\
SIMD256T::op(a.v8[0]),\
SIMD256T::op(a.v8[1]),\
};\
}
// Integer binary op.
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::op(a.v8[0], b.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1]),\
};\
}
// Integer binary op with a 16-bit per-lane immediate (split as SIMD_WRAPPER_2I).
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
};\
}
// Integer binary op with one immediate applied identically to both halves.
#define SIMD_IWRAPPER_2I_1(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
};\
}
// Integer binary op with an 8-bit immediate split into two 4-bit halves
// (used for 64-bit-lane shuffles, which take 4 control bits per half).
#define SIMD_IWRAPPER_2I_2(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
};\
}
// Integer ternary op.
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
{\
return Integer\
{\
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
};\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// Round each lane according to the RMT rounding-mode template parameter.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return Float
{
SIMD256T::template round_ps<RMT>(a.v8[0]),
SIMD256T::template round_ps<RMT>(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations (bitwise; the _ps forms reinterpret floats as bits)
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations. Immediate-count shifts apply the same count to both
// 256-bit halves; variable shifts (sllv/srlv) take a per-lane count.
//-----------------------------------------------------------------------
template<int ImmT>
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
{
return Integer
{
SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
template<int ImmT>
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
{
return Integer
{
SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
};
}
template<int ImmT>
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
{
return Integer
{
SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
};
}
// Byte-wise shift, applied independently within each 128-bit lane.
template<int ImmT> // for each 128-bit lane:
static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
{
return Integer
{
SIMD256T::template srli_si<ImmT>(a.v8[0]),
SIMD256T::template srli_si<ImmT>(a.v8[1]),
};
}
template<int ImmT>
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
{
return Float
{
SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Conversion operations. The cast* functions are bit-pattern
// reinterpretations (no value conversion); the cvt* functions convert
// values. Widening conversions take a 256-bit source because the result
// occupies the full 512 bits.
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return Float
{
SIMD256T::castpd_ps(a.v8[0]),
SIMD256T::castpd_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return Integer
{
SIMD256T::castps_si(a.v8[0]),
SIMD256T::castps_si(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return Double
{
SIMD256T::castsi_pd(a.v8[0]),
SIMD256T::castsi_pd(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return Double
{
SIMD256T::castps_pd(a.v8[0]),
SIMD256T::castps_pd(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return Float
{
SIMD256T::castsi_ps(a.v8[0]),
SIMD256T::castsi_ps(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return Float
{
SIMD256T::cvtepi32_ps(a.v8[0]),
SIMD256T::cvtepi32_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
{
return Integer
{
SIMD256T::cvtepu8_epi16(a.v4[0]),
SIMD256T::cvtepu8_epi16(a.v4[1]),
};
}
// All 16 source bytes live in the low 128-bit lane (a.v4[0]); the upper half
// of the result comes from bytes 8..15, exposed via a byte-shift of 8.
static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
{
return Integer
{
SIMD256T::cvtepu8_epi32(a.v4[0]),
SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
{
return Integer
{
SIMD256T::cvtepu16_epi32(a.v4[0]),
SIMD256T::cvtepu16_epi32(a.v4[1]),
};
}
// As with cvtepu8_epi32: all 8 source uint16s live in a.v4[0], so the upper
// result half reads words 4..7 via a byte-shift of 8.
static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
{
return Integer
{
SIMD256T::cvtepu16_epi64(a.v4[0]),
SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
{
return Integer
{
SIMD256T::cvtepu32_epi64(a.v4[0]),
SIMD256T::cvtepu32_epi64(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return Integer
{
SIMD256T::cvtps_epi32(a.v8[0]),
SIMD256T::cvtps_epi32(a.v8[1]),
};
}
// return (int32)a (rnd_to_zero(float) --> int32)
//
// Fix: forward to the truncating conversion on each 256-bit half. The
// previous code delegated to SIMD256T::cvtps_epi32, which rounds using the
// current rounding mode (round-to-nearest by default) instead of truncating
// toward zero, contradicting this function's contract.
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)
{
    return Integer
    {
        SIMD256T::cvttps_epi32(a.v8[0]),
        SIMD256T::cvttps_epi32(a.v8[1]),
    };
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return Float
{
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
// Compress the per-lane compare result into an integer bitmask (one bit per lane).
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
// Nonzero only when BOTH halves report all-zero (bitwise AND of the two
// per-half testz results).
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}
static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
SIMD256T::testz_si(a.v8[1], b.v8[1]));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
// Variable blend; overloaded so the mask may be supplied as Float or Integer.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return Integer
{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return Integer
{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    // Read the scalar once, replicate it into a 256-bit half, and use that
    // same half for both the low and high 8 lanes.
    float const value = *p;
    SIMD256Impl::Float const half = SIMD256T::set1_ps(value);
    return Float{ half, half };
}
// Extract / insert one 256-bit half. 'imm' selects the half: 0 = low 8 lanes,
// 1 = high 8 lanes; validated at runtime via SWR_ASSERT.
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
// Note: 'a' is received by value, so mutating a.v8[imm] and returning the
// copy leaves the caller's vector untouched.
template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
a.v8[imm] = b;
return a;
}
template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
a.v8[imm] = b;
return a;
}
template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
a.v8[imm] = b;
return a;
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Full-width lane permute, emulated with a scalar loop: the swizzle index is
// masked to 0xF since there are 16 (SIMD_WIDTH) source lanes.
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
Integer result;
// Ugly slow implementation
uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0xF & pSwiz[i]];
}
return result;
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
Float result;
// Ugly slow implementation
float const *pA = reinterpret_cast<float const*>(&a);
uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
float *pResult = reinterpret_cast<float *>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0xF & pSwiz[i]];
}
return result;
}
// All of the 512-bit permute2f128_XX intrinsics do the following:
//
// SELECT4(src, control) {
// CASE(control[1:0])
// 0: tmp[127:0] : = src[127:0]
// 1 : tmp[127:0] : = src[255:128]
// 2 : tmp[127:0] : = src[383:256]
// 3 : tmp[127:0] : = src[511:384]
// ESAC
// RETURN tmp[127:0]
// }
//
// dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
// dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
// dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
// dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
// dst[MAX:512] : = 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
// Each 2-bit AVX512 selector (0..3) addresses one of a's (or b's) four
// 128-bit lanes, so both of a's halves are passed to the 256-bit permute.
template <int shuf>
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
{
return Float
{
SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
{
return Double
{
SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
{
return Integer
{
SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
};
}
// Shuffle / unpack wrappers. Note the differing immediate treatment:
// _2I_1 passes ImmT unchanged to both halves; _2I_2 splits ImmT into two
// 4-bit halves (see the macro definitions above).
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Gather: the low 8 indices address the low result half, the high 8 the
// high half; both halves gather from the same base pointer.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float
{
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    // Replicate the single scalar at *p into every lane of both halves
    // (same effect as broadcast_ss).
    float const scalar = *p;
    return Float
    {
        SIMD256T::set1_ps(scalar),
        SIMD256T::set1_ps(scalar),
    };
}
// Full-width loads: the low half reads from p, the high half from
// p + TARGET_SIMD_WIDTH floats (i.e. the next 32 bytes).
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return Float
{
SIMD256T::load_ps(p),
SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
};
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return Integer
{
SIMD256T::load_si(&p->v8[0]),
SIMD256T::load_si(&p->v8[1]),
};
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return Float
{
SIMD256T::loadu_ps(p),
SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
};
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return Integer
{
SIMD256T::loadu_si(&p->v8[0]),
SIMD256T::loadu_si(&p->v8[1]),
};
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
return Float
{
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
};
}
// Masked store: lanes whose mask bit is set are written; the high half
// targets p + TARGET_SIMD_WIDTH floats.
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}
// Movemask: combine the two per-half bitmasks. The high half's mask is
// shifted past the low half's bit count: 32 bits for epi8
// (TARGET_SIMD_WIDTH * 4 bytes per 256-bit half), 4 bits for pd
// (TARGET_SIMD_WIDTH / 2 doubles), 8 bits for ps.
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
return mask;
}
// Broadcast / zero: both 256-bit halves receive the same value.
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return Integer
{
SIMD256T::set1_epi32(i),
SIMD256T::set1_epi32(i)
};
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return Integer
{
SIMD256T::set1_epi8(i),
SIMD256T::set1_epi8(i)
};
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return Float
{
SIMD256T::set1_ps(f),
SIMD256T::set1_ps(f)
};
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return Float
{
SIMD256T::setzero_ps(),
SIMD256T::setzero_ps()
};
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return Integer
{
SIMD256T::setzero_si(),
SIMD256T::setzero_si()
};
}
// Full-width stores: low half to p, high half to p + TARGET_SIMD_WIDTH floats.
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
SIMD256T::store_ps(p, a.v8[0]);
SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
SIMD256T::store_si(&p->v8[0], a.v8[0]);
SIMD256T::store_si(&p->v8[1], a.v8[1]);
}
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
SIMD256T::stream_ps(p, a.v8[0]);
SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
// Element-wise initialization: i0 is the lowest lane, so i7..i0 fill the low
// 256-bit half (v8[0]) and i15..i8 fill the high half (v8[1]).
static SIMDINLINE Integer SIMDCALL set_epi32(
int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return Integer
{
SIMD256T::set_epi32(
i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_epi32(
i15, i14, i13, i12, i11, i10, i9, i8)
};
}
// 8-argument overload: fills the low 8 lanes, zeroes the high 8 lanes.
static SIMDINLINE Integer SIMDCALL set_epi32(
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(
float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return Float
{
SIMD256T::set_ps(
i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_ps(
i15, i14, i13, i12, i11, i10, i9, i8)
};
}
// 8-argument overload: fills the low 8 lanes, zeroes the high 8 lanes.
static SIMDINLINE Float SIMDCALL set_ps(
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    // Expand an integer bitmask into a per-lane float mask: lane i becomes
    // all-ones when bit i of 'mask' is set, else zero. Each lane tests its
    // own bit (via AND with a per-lane power of two), then a signed
    // compare (0 < lane) turns any surviving bit into all-ones.
    const Integer laneBit = set_epi32(
        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    Integer replicated = set1_epi32(mask);
    replicated = and_si(replicated, laneBit);
    Integer selected = cmplt_epi32(setzero_si(), replicated);
    return castsi_ps(selected);
}
// Clean up helper macros so they do not leak into files included later.
// Fix: SIMD_IWRAPPER_2I_2 (defined above for the 4-bit split immediates)
// was missing from this list and leaked past this file.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3

View file

@ -0,0 +1,28 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// no backwards compatibility for simd mask-enabled functions

View file

@ -0,0 +1,428 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
//=======================================================================
// SIMD Types
//
// These typedefs are examples. The SIMD256 and SIMD16 implementations will
// use different base types with this same naming.
using Float = __m256; // Packed single-precision float vector
using Double = __m256d; // Packed double-precision float vector
using Integer = __m256i; // Packed integer vector (mutable element widths)
using Mask = uint8_t; // Integer representing mask bits
//=======================================================================
// Standard interface
// (available in both SIMD256 and SIMD16 widths)
//=======================================================================
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
// (each operates independently on every 32-bit float lane)
//-----------------------------------------------------------------------
static Float add_ps(Float a, Float b); // return a + b
static Float div_ps(Float a, Float b); // return a / b
static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
static Float max_ps(Float a, Float b); // return (a > b) ? a : b
static Float min_ps(Float a, Float b); // return (a < b) ? a : b
static Float mul_ps(Float a, Float b); // return a * b
static Float rcp_ps(Float a); // return 1.0f / a (NOTE(review): likely the hardware's low-precision approximation -- confirm accuracy needs)
static Float rsqrt_ps(Float a); // return 1.0f / sqrt(a) (NOTE(review): likely the hardware's low-precision approximation -- confirm accuracy needs)
static Float sub_ps(Float a, Float b); // return a - b
// Rounding control for round_ps: the low two bits select a rounding
// direction, 0x04 defers to the MXCSR register, and 0x08 suppresses
// floating point exceptions.
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5) (NOTE: x86 hardware rounds ties to even)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
// return round_func(a)
//
// round_func is chosen on the RMT template parameter. See the documentation
// for the RoundMode enumeration above.
template <RoundMode RMT>
static Float round_ps(Float a); // return round(a)
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
static Integer abs_epi32(Integer a); // return absolute_value(a) (int32)
static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
static Integer add_epi8(Integer a, Integer b); // return a + b (int8)
static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8, saturating)
static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
// (Return type corrected from Float to Integer: this is an integer op.)
static Integer mullo_epi32(Integer a, Integer b);
static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8, saturating)
//-----------------------------------------------------------------------
// Logical operations
// (bitwise; _ps variants operate on float vectors reinterpreted as bits)
//-----------------------------------------------------------------------
static Float and_ps(Float a, Float b); // return a & b (float treated as int)
static Integer and_si(Integer a, Integer b); // return a & b (int)
static Float andnot_ps(Float a, Float b); // return (~a) & b (float treated as int)
static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
static Float or_ps(Float a, Float b); // return a | b (float treated as int)
static Integer or_si(Integer a, Integer b); // return a | b (int) (return type corrected from Float to Integer, matching and_si / xor_si)
static Float xor_ps(Float a, Float b); // return a ^ b (float treated as int)
static Integer xor_si(Integer a, Integer b); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
static Integer slli_epi32(Integer a); // return a << ImmT
static Integer sllv_epi32(Integer a, Integer b); // return a << b (per-lane variable shift)
template<int ImmT>
static Integer srai_epi32(Integer a); // return a >> ImmT (int32, arithmetic)
template<int ImmT>
static Integer srli_epi32(Integer a); // return a >> ImmT (uint32, logical)
template<int ImmT> // for each 128-bit lane:
static Integer srli_si(Integer a); // return a >> (ImmT*8) (uint)
template<int ImmT>
static Float srlisi_ps(Float a); // same as srli_si, but with Float cast to int
static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32, per-lane variable shift)
//-----------------------------------------------------------------------
// Conversion operations
//
// cast* functions reinterpret the bit pattern (no value conversion);
// cvt* functions perform an actual value conversion.  The widening
// cvtepu* conversions read from the low elements of a.
//-----------------------------------------------------------------------
static Float castpd_ps(Double a); // return *(Float*)(&a)
static Integer castps_si(Float a); // return *(Integer*)(&a)
static Double castsi_pd(Integer a); // return *(Double*)(&a)
static Double castps_pd(Float a); // return *(Double*)(&a)
static Float castsi_ps(Integer a); // return *(Float*)(&a)
static Float cvtepi32_ps(Integer a); // return (float)a (int32 --> float)
static Integer cvtepu8_epi16(Integer a); // return (int16)a (uint8 --> int16)
static Integer cvtepu8_epi32(Integer a); // return (int32)a (uint8 --> int32)
static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
static Integer cvtps_epi32(Float a); // return (int32)a (float --> int32, rounds per current rounding mode)
static Integer cvttps_epi32(Float a); // return (int32)a (rnd_to_zero(float) --> int32)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Comparison types used with cmp_ps:
// - ordered comparisons are always false if either operand is NaN
// - unordered comparisons are always true if either operand is NaN
// - signaling comparisons raise an exception if either operand is NaN
// - non-signaling comparisons will never raise an exception
//
// Ordered: return (a != NaN) && (b != NaN) && (a cmp b)
// Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
//
// Enumerator values match the AVX _CMP_* predicate immediates.
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
// return a (CmpTypeT) b (float)
//
// See documentation for CompareType above for valid values for CmpTypeT.
template<CompareType CmpTypeT>
static Float cmp_ps(Float a, Float b); // return a (CmpTypeT) b (see above)
static Float cmpgt_ps(Float a, Float b); // return cmp_ps<CompareType::GT_OQ>(a, b)
static Float cmple_ps(Float a, Float b); // return cmp_ps<CompareType::LE_OQ>(a, b)
static Float cmplt_ps(Float a, Float b); // return cmp_ps<CompareType::LT_OQ>(a, b)
static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b)
static Float cmpeq_ps(Float a, Float b); // return cmp_ps<CompareType::EQ_OQ>(a, b)
static Float cmpge_ps(Float a, Float b); // return cmp_ps<CompareType::GE_OQ>(a, b)
// Integer compares produce all-1s (true) / all-0s (false) per lane.
static Integer cmpeq_epi8(Integer a, Integer b); // return a == b (int8)
static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
static Integer cmpgt_epi8(Integer a, Integer b); // return a > b (int8)
static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
static bool testz_ps(Float a, Float b); // return all_lanes_zero(a & b) ? 1 : 0 (float)
static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int)
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template<int ImmT>
static Float blend_ps(Float a, Float b); // return ImmT ? b : a (float)
static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
static Float blendv_ps(Float a, Float b, Float mask); // return mask ? b : a (float)
static Float broadcast_ss(float const *p); // return *p (all elements in vector get same value)
static Integer packs_epi16(Integer a, Integer b); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
static Integer packs_epi32(Integer a, Integer b); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
static Float permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
static Float permute_ps(Float a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (float)
template<int SwizT>
static Integer shuffle_epi32(Integer a, Integer b);
template<int SwizT>
static Integer shuffle_epi64(Integer a, Integer b);
static Integer shuffle_epi8(Integer a, Integer b);
template<int SwizT>
static Float shuffle_pd(Double a, Double b);
template<int SwizT>
static Float shuffle_ps(Float a, Float b);
static Integer unpackhi_epi16(Integer a, Integer b);
static Integer unpackhi_epi32(Integer a, Integer b);
static Integer unpackhi_epi64(Integer a, Integer b);
static Integer unpackhi_epi8(Integer a, Integer b);
static Float unpackhi_pd(Double a, Double b);
static Float unpackhi_ps(Float a, Float b);
static Integer unpacklo_epi16(Integer a, Integer b);
static Integer unpacklo_epi32(Integer a, Integer b);
static Integer unpacklo_epi64(Integer a, Integer b);
static Integer unpacklo_epi8(Integer a, Integer b);
static Float unpacklo_pd(Double a, Double b);
static Float unpacklo_ps(Float a, Float b);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Byte multiplier applied to gather indices.  Explicit values keep this
// enum consistent with SIMDImpl::ScaleFactor (simdlib_types.hpp) and make
// the (idx * ScaleT) address math in the comments below read literally;
// the previous implicit values (0..3) contradicted both.
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
template<ScaleFactor ScaleT>
static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements)
static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory)
static Integer load_si(Integer const *p); // return *p
static Float loadu_ps(float const *p); // return *p (same as load_ps but allows for unaligned mem)
static Integer loadu_si(Integer const *p); // return *p (same as load_si but allows for unaligned mem)
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// (Template parameter corrected from int to ScaleFactor, matching i32gather_ps.)
template<ScaleFactor ScaleT>
static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
static void maskstore_ps(float *p, Integer mask, Float src);
static int movemask_epi8(Integer a);
static int movemask_pd(Double a);
static int movemask_ps(Float a);
static Integer set1_epi32(int i); // return i (all elements are same value)
static Integer set1_epi8(char i); // return i (all elements are same value)
static Float set1_ps(float f); // return f (all elements are same value)
static Float setzero_ps(); // return 0 (float)
static Integer setzero_si(); // return 0 (integer)
static void store_ps(float *p, Float a); // *p = a (stores all elements contiguously in memory)
static void store_si(Integer *p, Integer a); // *p = a
static void stream_ps(float *p, Float a); // *p = a (same as store_ps, but doesn't keep memory in cache)
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
static Float broadcast_ps(__m128 const *p); // broadcast 128-bit value to both halves
template<int ImmT>
static __m128d extractf128_pd(Double a); // return 128-bit half ImmT of a
template<int ImmT>
static __m128 extractf128_ps(Float a);
template<int ImmT>
static __m128i extractf128_si(Integer a);
template<int ImmT>
static Double insertf128_pd(Double a, __m128d b); // replace 128-bit half ImmT of a with b
template<int ImmT>
static Float insertf128_ps(Float a, __m128 b);
template<int ImmT>
static Integer insertf128_si(Integer a, __m128i b);
// Load the high/low 128-bit halves from two (possibly unaligned) pointers;
// mirrors storeu2_si below.  (Parameter type corrected from __m128 const*
// to __m128i const*: this is an integer load.)
static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
template<int ImmT>
static Double permute2f128_pd(Double a, Double b);
template<int ImmT>
static Float permute2f128_ps(Float a, Float b);
template<int ImmT>
static Integer permute2f128_si(Integer a, Integer b);
static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
static void storeu2_si(__m128i *phi, __m128i *plo, Integer src); // store halves of src to two (possibly unaligned) pointers
//=======================================================================
// Advanced masking interface (currently available only in SIMD16 width)
//=======================================================================
//=======================================================================
// Extended Utility Functions (common to SIMD256 and SIMD16)
//=======================================================================
//-----------------------------------------------------------------------
// Extended Types
//-----------------------------------------------------------------------
// Vec4, an SOA SIMD set of 4-dimensional vectors
// (one SIMD register per component; all union members alias the same
// storage, so v[], vi[] and s.{x,y,z,w} are different views of it)
union Vec4
{
Vec4() = default;
// Broadcast one SIMD vector into all four components.
Vec4(Float in)
{
s.x = in;
s.y = in;
s.z = in;
s.w = in;
}
// Construct from per-component SIMD vectors.
Vec4(Float x, Float y, Float z, Float w)
{
s.x = x;
s.y = y;
s.z = z;
s.w = w;
}
Float v[4]; // components as an array of float vectors
Integer vi[4]; // same storage viewed as integer vectors
struct
{
Float x;
Float y;
Float z;
Float w;
} s;
Float& operator[] (const int i) { return v[i]; }
Float const & operator[] (const int i) const { return v[i]; }
};
//-----------------------------------------------------------------------
// Extended Functions
// (SOA 4-component vector helpers built on the operations above)
//-----------------------------------------------------------------------
static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ...
static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1)
static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1)
static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v))
static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v)
static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s)
static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s
static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s
// Matrix4x4 * Vector4
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
static void mat4x4_vec4_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
// NOTE(review): name says mat3x3 but the math above reads a full 4x4
// matrix (including the m3x row) -- confirm the name is intentional.
static void mat3x3_vec3_w0_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x4 * Vector3 - Position vector where w = 1.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
static void mat4x4_vec3_w1_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x3 * Vector3 - Position vector where w = 1.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
// result.s.w = 1
static void mat4x3_vec3_w1_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
};
#endif // #if 0

View file

@ -0,0 +1,377 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if !defined(__cplusplus)
#error C++ compilation required
#endif
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
// Target ISA levels, in increasing order of capability.  SIMD_ARCH selects
// which instruction set the SIMD implementation may assume at compile time
// (defaults to AVX when the build doesn't specify one).
#define SIMD_ARCH_AVX 0
#define SIMD_ARCH_AVX2 1
#define SIMD_ARCH_AVX512 2
#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif
// Compiler abstractions: calling convention, forced inlining, and type
// alignment (MSVC vs. GCC/Clang spellings).
#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif
namespace SIMDImpl
{
// Floating point comparison predicates for cmp_ps.
// Enumerator values match the AVX _CMP_* immediate encodings.
// "Ordered" predicates are false when either operand is NaN; "unordered"
// predicates are true.  "Signaling" predicates raise an invalid-operation
// exception on quiet NaN inputs; "nonsignaling" ones do not.
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
#if SIMD_ARCH >= SIMD_ARCH_AVX512
// Integer comparison predicates for the AVX512 compare-to-mask operations;
// values are the _MM_CMPINT_* immediates consumed by the _mm512_cmp_ep*
// intrinsics (only defined when AVX512 headers provide them).
enum class CompareTypeInt
{
EQ = _MM_CMPINT_EQ, // Equal
LT = _MM_CMPINT_LT, // Less than
LE = _MM_CMPINT_LE, // Less than or Equal
NE = _MM_CMPINT_NE, // Not Equal
GE = _MM_CMPINT_GE, // Greater than or Equal
GT = _MM_CMPINT_GT, // Greater than
};
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
// Byte multiplier applied to gather indices.  Enumerator values are the
// literal scale factors, so they can be used directly in address math.
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
// Rounding control for round_ps.  The low two bits select the rounding
// direction, CUR_DIRECTION (0x04) defers to the MXCSR register, and
// NO_EXC (0x08) suppresses floating point exceptions.
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5) (NOTE: x86 hardware rounds ties to even)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
// Bundles the shared enums so concrete SIMD implementations can
// re-export them uniformly from a single typedef.
struct Traits
{
using CompareType = SIMDImpl::CompareType;
using ScaleFactor = SIMDImpl::ScaleFactor;
using RoundMode = SIMDImpl::RoundMode;
};
// Attribute, 4-dimensional attribute in SIMD SOA layout
// (one SIMD register per component).  All union members alias the same
// storage: v/vi/vd are typed array views, the anonymous struct provides
// named component access.
template<typename Float, typename Integer, typename Double>
union Vec4
{
Float v[4];
Integer vi[4];
Double vd[4];
struct
{
Float x;
Float y;
Float z;
Float w;
};
SIMDINLINE Float& operator[] (const int i) { return v[i]; }
SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
// Member-wise copy through the Float view; since all members alias the
// same bytes, this copies the whole attribute regardless of which view
// the caller uses.
SIMDINLINE Vec4& operator=(Vec4 const & in)
{
v[0] = in.v[0];
v[1] = in.v[1];
v[2] = in.v[2];
v[3] = in.v[3];
return *this;
}
};
namespace SIMD128Impl
{
// Thin wrapper unions around the native 128-bit intrinsic types.  They
// add implicit conversion to/from the raw __m128/__m128i/__m128d types
// and force 16-byte alignment via SIMDALIGN.
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m128 in) : v(in) {}
SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
SIMDINLINE operator __m128() const { return v; }
SIMDALIGN(__m128, 16) v;
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m128i in) : v(in) {}
SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
SIMDINLINE operator __m128i() const { return v; }
SIMDALIGN(__m128i, 16) v;
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m128d in) : v(in) {}
SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
SIMDINLINE operator __m128d() const { return v; }
SIMDALIGN(__m128d, 16) v;
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t; // per-lane mask type
static const uint32_t SIMD_WIDTH = 4; // 4 x 32-bit lanes
} // ns SIMD128Impl
namespace SIMD256Impl
{
// Wrapper unions around the native 256-bit intrinsic types.  In addition
// to the implicit conversions, each can be constructed from two 128-bit
// halves (upper half defaults to zero) and exposes the halves via v4[2],
// which aliases the full-width member.
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m256 in) : v(in) {}
// Compose from low/high 128-bit halves; in_hi defaults to zero.
SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
{
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
}
SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
SIMDINLINE operator __m256() const { return v; }
SIMDALIGN(__m256, 32) v;
SIMD128Impl::Float v4[2]; // aliases v as two 128-bit halves
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m256i in) : v(in) {}
// Compose from low/high 128-bit halves; in_hi defaults to zero.
SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
{
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
}
SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
SIMDINLINE operator __m256i() const { return v; }
SIMDALIGN(__m256i, 32) v;
SIMD128Impl::Integer v4[2]; // aliases v as two 128-bit halves
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m256d in) : v(in) {}
// Compose from low/high 128-bit halves; in_hi defaults to zero.
SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
{
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
}
SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
SIMDINLINE operator __m256d() const { return v; }
SIMDALIGN(__m256d, 32) v;
SIMD128Impl::Double v4[2]; // aliases v as two 128-bit halves
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t; // per-lane mask type
static const uint32_t SIMD_WIDTH = 8; // 8 x 32-bit lanes
} // ns SIMD256Impl
namespace SIMD512Impl
{
// NOTE(review): _MM_K0_REG is used as a proxy for "immintrin.h provided
// the AVX512 types"; this macro is compiler-specific -- confirm it holds
// on all supported toolchains.
#if !defined(_MM_K0_REG)
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to be viewed
// in a debugger. Do NOT access them via code!
union __m512
{
private:
float m512_f32[16];
};
struct __m512d
{
private:
double m512d_f64[8];
};
union __m512i
{
private:
int8_t m512i_i8[64];
int16_t m512i_i16[32];
int32_t m512i_i32[16];
int64_t m512i_i64[8];
uint8_t m512i_u8[64];
uint16_t m512i_u16[32];
uint32_t m512i_u32[16];
uint64_t m512i_u64[8];
};
using __mmask16 = uint16_t;
#endif
// Full 64-byte alignment is only required when real AVX512 codegen is
// enabled; emulated builds operate on the 256-bit halves and need 32.
#if SIMD_ARCH >= SIMD_ARCH_AVX512
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif
// 512-bit wrapper unions.  v8[2] aliases the full-width member as two
// 256-bit halves.  On non-AVX512 builds, copy assignment goes through
// the halves so the (possibly dummy, debug-only) __m512 member is never
// touched by code.
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m512 in) : v(in) {}
SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
SIMDINLINE Float& operator=(Float const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE operator __m512() const { return v; }
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Float v8[2]; // aliases v as two 256-bit halves
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m512i in) : v(in) {}
SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
SIMDINLINE Integer& operator=(Integer const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE operator __m512i() const { return v; }
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Integer v8[2]; // aliases v as two 256-bit halves
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m512d in) : v(in) {}
SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
SIMDINLINE Double& operator=(Double const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE operator __m512d() const { return v; }
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Double v8[2]; // aliases v as two 256-bit halves
};
typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
using Mask = __mmask16; // 16-bit lane mask
static const uint32_t SIMD_WIDTH = 16; // 16 x 32-bit lanes
#undef SIMD_ALIGNMENT_BYTES
} // ns SIMD512Impl
} // ns SIMDImpl

View file

@ -43,10 +43,10 @@ enum SWR_BACKEND_FUNCS
};
#if KNOB_SIMD_WIDTH == 8
static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#define MASK 0xff
#endif
@ -163,52 +163,52 @@ struct generateInputCoverage
uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
if(T::MultisampleT::numSamples == 1)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
}
else if(T::MultisampleT::numSamples == 2)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 4)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 8)
{
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
}
else if(T::MultisampleT::numSamples == 16)
{
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
}
}
else
{
__m256i src = _mm256_set1_epi32(0);
__m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
simdscalari src = _simd_set1_epi32(0);
simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
if(T::MultisampleT::numSamples == 1)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
}
else if(T::MultisampleT::numSamples == 2)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
}
else if(T::MultisampleT::numSamples == 4)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
}
else if(T::MultisampleT::numSamples == 8)
{
mask[0] = _mm256_set1_epi32(-1);
mask[0] = _simd_set1_epi32(-1);
}
else if(T::MultisampleT::numSamples == 16)
{
mask[0] = _mm256_set1_epi32(-1);
mask[1] = _mm256_set1_epi32(-1);
index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
mask[0] = _simd_set1_epi32(-1);
mask[1] = _simd_set1_epi32(-1);
index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
}
// gather coverage for samples 0-7
@ -253,14 +253,14 @@ struct generateInputCoverage
packedSampleCoverage = packedCoverage0;
}
#else
simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
@ -293,7 +293,7 @@ struct generateInputCoverage
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
}
};
@ -305,10 +305,10 @@ struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
simdscalari vec = _simd_set1_epi32(coverageMask[0]);
const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
inputCoverage = _simd_castsi_ps(vec);
}
@ -357,7 +357,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
(inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
// look up and set the sample offsets from UL pixel corner for first covered sample
__m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
samplePos.X(sampleNum[6]),
samplePos.X(sampleNum[5]),
samplePos.X(sampleNum[4]),
@ -366,7 +366,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
samplePos.X(sampleNum[1]),
samplePos.X(sampleNum[0]));
__m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
samplePos.Y(sampleNum[6]),
samplePos.Y(sampleNum[5]),
samplePos.Y(sampleNum[4]),
@ -380,7 +380,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
// Case (1) and case (3b) - All samples covered or not covered with full SampleMask
static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
static const simdscalari vZero = _simd_setzero_si();

View file

@ -88,7 +88,7 @@ INLINE void ProcessAttributes(
inputSlot = backendState.vertexAttribOffset + i;
}
__m128 attrib[3]; // triangle attribs (always 4 wide)
simd4scalar attrib[3]; // triangle attribs (always 4 wide)
float* pAttribStart = pBuffer;
if (HasConstantInterpT::value || IsDegenerate::value)
@ -128,7 +128,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[vid]);
SIMD128::store_ps(pBuffer, attrib[vid]);
pBuffer += 4;
}
}
@ -138,7 +138,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[i]);
SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@ -149,7 +149,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[i]);
SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@ -160,7 +160,7 @@ INLINE void ProcessAttributes(
// effect of the missing vertices in the triangle interpolation.
for (uint32_t v = NumVertsT::value; v < 3; ++v)
{
_mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
pBuffer += 4;
}
@ -279,8 +279,7 @@ struct GatherScissors_simd16<16>
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simd16scalari &scisXmin, simd16scalari &scisYmin,
simd16scalari &scisXmax, simd16scalari &scisYmax)
{
simd16scalari &scisXmax, simd16scalari &scisYmax) {
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask,
uint32_t clipAttribSlot = clipSlot == 0 ?
VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
__m128 primClipDist[3];
simd4scalar primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
{
OSALIGNSIMD(float) aVertClipDist[4];
_mm_store_ps(aVertClipDist, primClipDist[e]);
SIMD128::store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
};
@ -625,13 +624,14 @@ void BinTriangles(
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}
simdBBox bbox;
if (!triMask)
{
goto endBinTriangles;
}
// Calc bounding box of triangles
simdBBox bbox;
calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
@ -673,28 +673,30 @@ void BinTriangles(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
// Make triangle bbox inclusive
bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
// Make triangle bbox inclusive
bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
}
if (CT::IsConservativeT::value)
{
@ -768,7 +770,7 @@ endBinTriangles:
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@ -837,10 +839,10 @@ endBinTriangles:
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
_mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
@ -870,7 +872,7 @@ endBinTriangles:
#if USE_SIMD16_FRONTEND
template <typename CT>
void SIMDAPI BinTriangles_simd16(
void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
// Make triangle bbox inclusive
bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
// Make triangle bbox inclusive
bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
}
if (CT::IsConservativeT::value)
{
@ -1221,10 +1225,10 @@ endBinTriangles:
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@ -1547,24 +1551,26 @@ void BinPostSetupPoints(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
}
// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
}
// Cull bloated points completely outside scissor
simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16(
AR_END(FEBinPoints, 1);
}
void SIMDAPI BinPoints_simd16(
void SIMDCALL BinPoints_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -2168,6 +2176,8 @@ void BinPostSetupLines(
simdscalar& vRecipW0 = recipW[0];
simdscalar& vRecipW1 = recipW[1];
simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
// convert to fixed point
simdscalari vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical(prim[0].x);
@ -2214,24 +2224,26 @@ void BinPostSetupLines(
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
}
// Cull prims completely outside scissor
{
@ -2261,7 +2273,6 @@ void BinPostSetupLines(
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@ -2310,10 +2321,10 @@ void BinPostSetupLines(
// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
_mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16(
bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
}
// Cull prims completely outside scissor
{
@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16(
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
@ -2650,7 +2663,7 @@ void BinLines(
}
#if USE_SIMD16_FRONTEND
void SIMDAPI BinLines_simd16(
void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,

View file

@ -188,7 +188,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p
}
#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipTriangles, pDC->drawId);
@ -203,7 +203,7 @@ void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t work
AR_END(FEClipTriangles, 1);
}
void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipLines, pDC->drawId);
@ -218,7 +218,7 @@ void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId
AR_END(FEClipLines, 1);
}
void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipPoints, pDC->drawId);

View file

@ -1095,7 +1095,7 @@ public:
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
// we have to clip tris, execute the clipper, which will also
// call the binner
ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId);
ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
AR_END(FEGuardbandClip, 1);
}
else if (validMask)
@ -1180,7 +1180,7 @@ private:
{
simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simd16scalar vSrc = _simd16_setzero_ps();
return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
}
#endif
@ -1895,8 +1895,8 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
#endif

View file

@ -218,7 +218,7 @@ typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t worke
#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
uint32_t primMask, simd16scalari primID);
#endif

View file

@ -202,7 +202,7 @@ INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@ -247,7 +247,7 @@ INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
@ -293,7 +293,7 @@ INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
@ -309,7 +309,7 @@ INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT DstFormat>
INLINE void SIMDAPI StoreSOA(const simd16vector &src, uint8_t *pDst)
INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))

View file

@ -43,7 +43,7 @@ struct PackTraits
static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static simd16scalar unpack(simd16scalar &in) = delete;
static simd16scalar pack(simd16scalar &in) = delete;
#endif
@ -63,7 +63,7 @@ struct PackTraits<0, false>
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
#endif
@ -109,7 +109,7 @@ struct PackTraits<8, false>
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
return simdscalar{ _mm256_castsi256_ps(result) };
#else
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
@ -144,7 +144,7 @@ struct PackTraits<8, false>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@ -152,7 +152,8 @@ struct PackTraits<8, false>
static simd16scalar unpack(simd16scalar &in)
{
simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
simd16scalari result = _simd16_cvtepu8_epi32(tmp);
return _simd16_castsi_ps(result);
}
@ -259,7 +260,7 @@ struct PackTraits<8, true>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@ -267,7 +268,8 @@ struct PackTraits<8, true>
static simd16scalar unpack(simd16scalar &in)
{
simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
simd16scalari result = _simd16_cvtepu8_epi32(tmp);
return _simd16_castsi_ps(result);
}
@ -370,7 +372,7 @@ struct PackTraits<16, false>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
@ -469,7 +471,7 @@ struct PackTraits<16, true>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
@ -514,7 +516,7 @@ struct PackTraits<32, false>
return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd16_store_ps(reinterpret_cast<float *>(pDst), src);
}
@ -812,7 +814,7 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src)
#if ENABLE_AVX512_SIMD16
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
{
static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
* powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
@ -834,7 +836,7 @@ inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
return result;
}
inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
{
// 5/12 is too small, so compute the 4th root of 20/12 instead.
// 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
@ -855,7 +857,7 @@ inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
return xavg;
}
inline static simd16scalar SIMDAPI powf_wrapper(const simd16scalar base, float exp)
inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
{
const float *f = reinterpret_cast<const float *>(&base);
@ -1410,7 +1412,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
}
INLINE static void SIMDAPI storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
{
switch (comp)
{

View file

@ -31,58 +31,58 @@
#include "common/simdintrin.h"
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
{
__m128i row0i = _mm_castps_si128(row0);
__m128i row1i = _mm_castps_si128(row1);
__m128i row2i = _mm_castps_si128(row2);
__m128i row3i = _mm_castps_si128(row3);
simd4scalari row0i = SIMD128::castps_si(row0);
simd4scalari row1i = SIMD128::castps_si(row1);
simd4scalari row2i = SIMD128::castps_si(row2);
simd4scalari row3i = SIMD128::castps_si(row3);
__m128i vTemp = row2i;
row2i = _mm_unpacklo_epi32(row2i, row3i);
vTemp = _mm_unpackhi_epi32(vTemp, row3i);
simd4scalari vTemp = row2i;
row2i = SIMD128::unpacklo_epi32(row2i, row3i);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
row3i = row0i;
row0i = _mm_unpacklo_epi32(row0i, row1i);
row3i = _mm_unpackhi_epi32(row3i, row1i);
row0i = SIMD128::unpacklo_epi32(row0i, row1i);
row3i = SIMD128::unpackhi_epi32(row3i, row1i);
row1i = row0i;
row0i = _mm_unpacklo_epi64(row0i, row2i);
row1i = _mm_unpackhi_epi64(row1i, row2i);
row0i = SIMD128::unpacklo_epi64(row0i, row2i);
row1i = SIMD128::unpackhi_epi64(row1i, row2i);
row2i = row3i;
row2i = _mm_unpacklo_epi64(row2i, vTemp);
row3i = _mm_unpackhi_epi64(row3i, vTemp);
row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
row0 = _mm_castsi128_ps(row0i);
row1 = _mm_castsi128_ps(row1i);
row2 = _mm_castsi128_ps(row2i);
row3 = _mm_castsi128_ps(row3i);
row0 = SIMD128::castsi_ps(row0i);
row1 = SIMD128::castsi_ps(row1i);
row2 = SIMD128::castsi_ps(row2i);
row3 = SIMD128::castsi_ps(row3i);
}
INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
{
__m128i vTemp = row2;
row2 = _mm_unpacklo_epi32(row2, row3);
vTemp = _mm_unpackhi_epi32(vTemp, row3);
simd4scalari vTemp = row2;
row2 = SIMD128::unpacklo_epi32(row2, row3);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
row3 = row0;
row0 = _mm_unpacklo_epi32(row0, row1);
row3 = _mm_unpackhi_epi32(row3, row1);
row0 = SIMD128::unpacklo_epi32(row0, row1);
row3 = SIMD128::unpackhi_epi32(row3, row1);
row1 = row0;
row0 = _mm_unpacklo_epi64(row0, row2);
row1 = _mm_unpackhi_epi64(row1, row2);
row0 = SIMD128::unpacklo_epi64(row0, row2);
row1 = SIMD128::unpackhi_epi64(row1, row2);
row2 = row3;
row2 = _mm_unpacklo_epi64(row2, vTemp);
row3 = _mm_unpackhi_epi64(row3, vTemp);
row2 = SIMD128::unpacklo_epi64(row2, vTemp);
row3 = SIMD128::unpackhi_epi64(row3, vTemp);
}
#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
@ -94,10 +94,10 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@ -106,7 +106,7 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
}
INLINE
void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
@ -118,10 +118,10 @@ void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@ -227,16 +227,16 @@ struct Transpose8_8_8_8
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
__m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
__m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
__m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
__m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
__m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
__m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
__m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
__m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
_mm_store_si128((__m128i*)pDst, c0123lo);
_mm_store_si128((__m128i*)(pDst + 16), c0123hi);
simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
SIMD128::store_si((simd4scalari*)pDst, c0123lo);
SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
#else
simdscalari dst01 = _simd_shuffle_epi8(src,
_simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
@ -254,10 +254,10 @@ struct Transpose8_8_8_8
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
__m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
__m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
__m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
@ -305,10 +305,10 @@ struct Transpose8_8
#if KNOB_SIMD_WIDTH == 8
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
__m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
__m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = _mm_unpacklo_epi8(rg, g);
_mm_store_si128((__m128i*)pDst, rg);
simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = SIMD128::unpacklo_epi8(rg, g);
SIMD128::store_si((simd4scalari*)pDst, rg);
#else
#error Unsupported vector width
#endif
@ -317,8 +317,8 @@ struct Transpose8_8
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
__m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
@ -349,16 +349,16 @@ struct Transpose32_32_32_32
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
__m128 vDst[8];
simd4scalar vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst+4, vDst[1]);
_mm_store_ps((float*)pDst+8, vDst[2]);
_mm_store_ps((float*)pDst+12, vDst[3]);
_mm_store_ps((float*)pDst+16, vDst[4]);
_mm_store_ps((float*)pDst+20, vDst[5]);
_mm_store_ps((float*)pDst+24, vDst[6]);
_mm_store_ps((float*)pDst+28, vDst[7]);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst+4, vDst[1]);
SIMD128::store_ps((float*)pDst+8, vDst[2]);
SIMD128::store_ps((float*)pDst+12, vDst[3]);
SIMD128::store_ps((float*)pDst+16, vDst[4]);
SIMD128::store_ps((float*)pDst+20, vDst[5]);
SIMD128::store_ps((float*)pDst+24, vDst[6]);
SIMD128::store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
@ -400,16 +400,16 @@ struct Transpose32_32_32
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
__m128 vDst[8];
simd4scalar vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst + 4, vDst[1]);
_mm_store_ps((float*)pDst + 8, vDst[2]);
_mm_store_ps((float*)pDst + 12, vDst[3]);
_mm_store_ps((float*)pDst + 16, vDst[4]);
_mm_store_ps((float*)pDst + 20, vDst[5]);
_mm_store_ps((float*)pDst + 24, vDst[6]);
_mm_store_ps((float*)pDst + 28, vDst[7]);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
@ -448,21 +448,21 @@ struct Transpose32_32
{
#if KNOB_SIMD_WIDTH == 8
const float* pfSrc = (const float*)pSrc;
__m128 src_r0 = _mm_load_ps(pfSrc + 0);
__m128 src_r1 = _mm_load_ps(pfSrc + 4);
__m128 src_g0 = _mm_load_ps(pfSrc + 8);
__m128 src_g1 = _mm_load_ps(pfSrc + 12);
simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
__m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
__m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
__m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
__m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
float* pfDst = (float*)pDst;
_mm_store_ps(pfDst + 0, dst0);
_mm_store_ps(pfDst + 4, dst1);
_mm_store_ps(pfDst + 8, dst2);
_mm_store_ps(pfDst + 12, dst3);
SIMD128::store_ps(pfDst + 0, dst0);
SIMD128::store_ps(pfDst + 4, dst1);
SIMD128::store_ps(pfDst + 8, dst2);
SIMD128::store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
@ -504,25 +504,25 @@ struct Transpose16_16_16_16
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
__m128i src_r = _mm256_extractf128_si256(src_rg, 0);
__m128i src_g = _mm256_extractf128_si256(src_rg, 1);
__m128i src_b = _mm256_extractf128_si256(src_ba, 0);
__m128i src_a = _mm256_extractf128_si256(src_ba, 1);
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
__m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
__m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
__m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
__m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
__m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
__m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
__m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
__m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
_mm_store_si128(((__m128i*)pDst) + 0, dst0);
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
@ -573,25 +573,25 @@ struct Transpose16_16_16
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
__m128i src_r = _mm256_extractf128_si256(src_rg, 0);
__m128i src_g = _mm256_extractf128_si256(src_rg, 1);
__m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
__m128i src_a = _mm_undefined_si128();
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
simd4scalari src_a = SIMD128::setzero_si();
__m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
__m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
__m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
__m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
__m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
__m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
__m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
__m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
_mm_store_si128(((__m128i*)pDst) + 0, dst0);
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
@ -642,17 +642,17 @@ struct Transpose16_16
#if KNOB_SIMD_WIDTH == 8
simdscalar src = _simd_load_ps((const float*)pSrc);
__m128 comp0 = _mm256_castps256_ps128(src);
__m128 comp1 = _mm256_extractf128_ps(src, 1);
simd4scalar comp0 = _simd_extractf128_ps(src, 0);
simd4scalar comp1 = _simd_extractf128_ps(src, 1);
__m128i comp0i = _mm_castps_si128(comp0);
__m128i comp1i = _mm_castps_si128(comp1);
simd4scalari comp0i = SIMD128::castps_si(comp0);
simd4scalari comp1i = SIMD128::castps_si(comp1);
__m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
__m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
_mm_store_si128((__m128i*)pDst, resLo);
_mm_store_si128((__m128i*)pDst + 1, resHi);
SIMD128::store_si((simd4scalari*)pDst, resLo);
SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif

View file

@ -527,7 +527,7 @@ static void StreamOut(
// Write all entries into primitive data buffer for SOS.
while (_BitScanForward(&slot, soMask))
{
__m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
pa.AssembleSingle(paSlot, primIndex, attrib);
@ -941,7 +941,9 @@ static void GeometryShaderStage(
if (HasStreamOutT::value)
{
#if ENABLE_AVX512_SIMD16
gsPa.useAlternateOffset = false;
#endif
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
}
@ -1279,7 +1281,9 @@ static void TessellationStages(
{
if (HasStreamOutT::value)
{
#if ENABLE_AVX512_SIMD16
tessPa.useAlternateOffset = false;
#endif
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
}

View file

@ -391,7 +391,7 @@ struct PA_STATE_BASE; // forward decl
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
#if USE_SIMD16_FRONTEND
void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
#endif

View file

@ -92,7 +92,7 @@ struct PA_STATE
#if ENABLE_AVX512_SIMD16
virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
virtual bool NextPrim() = 0;
virtual SIMDVERTEX& GetNextVsOutput() = 0;
virtual bool GetNextStreamOutput() = 0;
@ -139,7 +139,7 @@ struct PA_STATE_OPT : public PA_STATE
#if ENABLE_AVX512_SIMD16
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
@ -205,7 +205,7 @@ struct PA_STATE_OPT : public PA_STATE
#endif
// Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
}
@ -767,7 +767,7 @@ PRAGMA_WARNING_POP()
}
#endif
void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
{
// move to slot
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
@ -1253,7 +1253,7 @@ struct PA_TESS : PA_STATE
_simd16_setzero_ps(),
pBase,
indices,
mask,
_simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
@ -1263,7 +1263,7 @@ struct PA_TESS : PA_STATE
pBase,
indices,
_simd_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
4); // gcc doesn't like sizeof(float)
#endif
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
}
@ -1302,7 +1302,7 @@ struct PA_TESS : PA_STATE
_simd16_setzero_ps(),
pBase,
indices,
mask,
_simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
#else
simdscalar temp = _simd_mask_i32gather_ps(
@ -1321,7 +1321,7 @@ struct PA_TESS : PA_STATE
}
#endif
void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
SWR_ASSERT(slot < m_numAttributes);
SWR_ASSERT(primIndex < PA_TESS::NumPrims());

View file

@ -34,103 +34,103 @@
#if (KNOB_SIMD_WIDTH == 8)
INLINE __m128 swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane0(const simdvector &v)
INLINE simd4scalar swizzleLane0(const simdvector &v)
{
return swizzleLane0(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane1(const simdvector &v)
INLINE simd4scalar swizzleLane1(const simdvector &v)
{
return swizzleLane1(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane2(const simdvector &v)
INLINE simd4scalar swizzleLane2(const simdvector &v)
{
return swizzleLane2(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane3(const simdvector &v)
INLINE simd4scalar swizzleLane3(const simdvector &v)
{
return swizzleLane3(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane4(const simdvector &v)
INLINE simd4scalar swizzleLane4(const simdvector &v)
{
return swizzleLane4(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane5(const simdvector &v)
INLINE simd4scalar swizzleLane5(const simdvector &v)
{
return swizzleLane5(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane6(const simdvector &v)
INLINE simd4scalar swizzleLane6(const simdvector &v)
{
return swizzleLane6(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane7(const simdvector &v)
INLINE simd4scalar swizzleLane7(const simdvector &v)
{
return swizzleLane7(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
{
switch (lane)
{
@ -156,87 +156,87 @@ INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
}
#if ENABLE_AVX512_SIMD16
INLINE __m128 swizzleLane0(const simd16vector &v)
INLINE simd4scalar swizzleLane0(const simd16vector &v)
{
return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane1(const simd16vector &v)
INLINE simd4scalar swizzleLane1(const simd16vector &v)
{
return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane2(const simd16vector &v)
INLINE simd4scalar swizzleLane2(const simd16vector &v)
{
return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane3(const simd16vector &v)
INLINE simd4scalar swizzleLane3(const simd16vector &v)
{
return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane4(const simd16vector &v)
INLINE simd4scalar swizzleLane4(const simd16vector &v)
{
return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane5(const simd16vector &v)
INLINE simd4scalar swizzleLane5(const simd16vector &v)
{
return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane6(const simd16vector &v)
INLINE simd4scalar swizzleLane6(const simd16vector &v)
{
return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane7(const simd16vector &v)
INLINE simd4scalar swizzleLane7(const simd16vector &v)
{
return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane8(const simd16vector &v)
INLINE simd4scalar swizzleLane8(const simd16vector &v)
{
return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLane9(const simd16vector &v)
INLINE simd4scalar swizzleLane9(const simd16vector &v)
{
return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneA(const simd16vector &v)
INLINE simd4scalar swizzleLaneA(const simd16vector &v)
{
return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneB(const simd16vector &v)
INLINE simd4scalar swizzleLaneB(const simd16vector &v)
{
return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneC(const simd16vector &v)
INLINE simd4scalar swizzleLaneC(const simd16vector &v)
{
return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneD(const simd16vector &v)
INLINE simd4scalar swizzleLaneD(const simd16vector &v)
{
return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneE(const simd16vector &v)
INLINE simd4scalar swizzleLaneE(const simd16vector &v)
{
return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneF(const simd16vector &v)
INLINE simd4scalar swizzleLaneF(const simd16vector &v)
{
return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneN(const simd16vector &v, int lane)
INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
{
switch (lane)
{
@ -286,7 +286,7 @@ bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -294,7 +294,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -302,7 +302,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -310,7 +310,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -318,7 +318,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -326,7 +326,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -334,13 +334,13 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -350,10 +350,10 @@ bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
template <uint32_t TotalControlPoints>
void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
// We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
// KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
@ -788,7 +788,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -1057,7 +1057,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@ -1325,7 +1325,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
@ -1491,7 +1491,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -1741,7 +1741,7 @@ bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
PaLineStripSingle0(pa, slot, primIndex, verts);
@ -1855,7 +1855,7 @@ bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -2075,7 +2075,7 @@ bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@ -2239,7 +2239,7 @@ bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -2529,7 +2529,7 @@ void PaRectListSingle0(
PA_STATE_OPT& pa,
uint32_t slot,
uint32_t primIndex,
__m128 verts[])
simd4scalar verts[])
{
// We have 12 simdscalars contained within 3 simdvectors which
// hold at least 8 triangles worth of data. We want to assemble a single

View file

@ -199,15 +199,15 @@ struct StorePixels<32, 2>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
{
// Each 4-pixel row is 16-bytes
__m128i *pZRow01 = (__m128i*)pSrc;
__m128i vQuad00 = _mm_load_si128(pZRow01);
__m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
simd4scalari *pZRow01 = (simd4scalari*)pSrc;
simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
__m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
__m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
_mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
_mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
}
};
@ -218,20 +218,20 @@ struct StorePixels<32, 4>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
// 4 x 16 bytes = 64 bytes, 16 pixels
const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
__m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
// Unswizzle from SWR-Z order
__m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
__m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
__m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
__m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3
simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7
simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B
simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F
_mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
_mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
_mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
_mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5
SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7
SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D
SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F
}
};
@ -251,10 +251,10 @@ struct StorePixels<64, 4>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
// Each 4-pixel row is 32 bytes.
const __m128i* pPixSrc = (const __m128i*)pSrc;
const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
// order of pointers match SWR-Z layout
__m128i** pvDsts = (__m128i**)&ppDsts[0];
simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
*pvDsts[0] = pPixSrc[0];
*pvDsts[1] = pPixSrc[1];
*pvDsts[2] = pPixSrc[2];
@ -269,9 +269,9 @@ struct StorePixels<64, 8>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
{
// 8 x 16 bytes = 128 bytes, 16 pixels
const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
__m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
// order of pointers match SWR-Z layout
*ppDsts128[0] = pSrc128[0]; // 0 1
@ -301,10 +301,10 @@ struct StorePixels<128, 8>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
{
// Each 4-pixel row is 64 bytes.
const __m128i* pPixSrc = (const __m128i*)pSrc;
const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
// Unswizzle from SWR-Z order
__m128i** pvDsts = (__m128i**)&ppDsts[0];
simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
*pvDsts[0] = pPixSrc[0];
*pvDsts[1] = pPixSrc[2];
*pvDsts[2] = pPixSrc[1];
@ -323,9 +323,9 @@ struct StorePixels<128, 16>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
{
// 16 x 16 bytes = 256 bytes, 16 pixels
const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
__m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
for (uint32_t i = 0; i < 16; i += 4)
{
@ -563,8 +563,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
// merge/store data into destination but don't overwrite the X8 bits
simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
simd16scalari dest = _simd16_setzero_si();
@ -575,8 +575,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
_simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
_simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
#else
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
@ -593,25 +593,25 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
// Store data into destination but don't overwrite the X8 bits
// Each 4-pixel row is 16-bytes
__m128i *pZRow01 = (__m128i*)aosTile;
__m128i vQuad00 = _mm_load_si128(pZRow01);
__m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
simd4scalari *pZRow01 = (simd4scalari*)aosTile;
simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
__m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
__m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
__m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
__m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
__m128i vMask = _mm_set1_epi32(0xFFFFFF);
simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
vDst0 = _mm_andnot_si128(vMask, vDst0);
vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
vDst1 = _mm_andnot_si128(vMask, vDst1);
vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
vDst0 = SIMD128::andnot_si(vMask, vDst0);
vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
vDst1 = SIMD128::andnot_si(vMask, vDst1);
vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
_mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
_mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
#endif
}
};
@ -683,8 +683,8 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
// store 8x2 memory order:
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
@ -736,15 +736,15 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
// splitting into two sets of 4 wide integer vector types
// because AVX doesn't have instructions to support this operation at 8 wide
__m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
__m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
__m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
__m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
__m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
__m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
__m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
__m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
@ -753,18 +753,18 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
// unpack into rows that get the tiling order correct
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
@ -785,7 +785,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
final = _mm256_permute4x64_epi64(final, 0xD8);
#endif
_simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
#if USE_8x2_TILE_BACKEND
@ -848,8 +848,8 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8
// store 8x2 memory order:
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
@ -894,29 +894,29 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
// splitting into two sets of 4 wide integer vector types
// because AVX doesn't have instructions to support this operation at 8 wide
__m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
__m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
__m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
__m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
__m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
__m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
// unpack into rows that get the tiling order correct
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
@ -936,7 +936,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
#endif
_simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
template<>