swr/rast: Switch intrinsic usage to SIMDLib

Switch from a macro-based SIMD intrinsics layer to a more idiomatic C++
implementation, which also adds AVX512 optimizations for 128-bit
and 256-bit SIMD.

Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
This commit is contained in:
Tim Rowley 2017-06-15 15:24:07 -05:00
parent 8b66d18a3b
commit fc4f6c44c4
30 changed files with 6206 additions and 2663 deletions

View file

@ -65,6 +65,19 @@ COMMON_CXX_SOURCES := \
rasterizer/common/rdtsc_buckets_shared.h \
rasterizer/common/simd16intrin.h \
rasterizer/common/simdintrin.h \
rasterizer/common/simdlib.hpp \
rasterizer/common/simdlib_128_avx.inl \
rasterizer/common/simdlib_128_avx2.inl \
rasterizer/common/simdlib_128_avx512.inl \
rasterizer/common/simdlib_256_avx.inl \
rasterizer/common/simdlib_256_avx2.inl \
rasterizer/common/simdlib_256_avx512.inl \
rasterizer/common/simdlib_512_avx512.inl \
rasterizer/common/simdlib_512_avx512_masks.inl \
rasterizer/common/simdlib_512_emu.inl \
rasterizer/common/simdlib_512_emu_masks.inl \
rasterizer/common/simdlib_interface.hpp \
rasterizer/common/simdlib_types.hpp \
rasterizer/common/swr_assert.cpp \
rasterizer/common/swr_assert.h

View file

@ -26,89 +26,37 @@
#include "os.h"
#include <cassert>
#define SIMD_ARCH KNOB_ARCH
#include "simdlib_types.hpp"
#include <emmintrin.h>
#include <immintrin.h>
#include <xmmintrin.h>
// Fixed-width SIMD type aliases, sourced from the SIMDLib C++ implementation.
// simd4* = 128-bit (4 x 32-bit lanes)
typedef SIMDImpl::SIMD128Impl::Float simd4scalar;
typedef SIMDImpl::SIMD128Impl::Double simd4scalard;
typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector;
typedef SIMDImpl::SIMD128Impl::Mask simd4mask;
// simd8* = 256-bit (8 x 32-bit lanes)
typedef SIMDImpl::SIMD256Impl::Float simd8scalar;
typedef SIMDImpl::SIMD256Impl::Double simd8scalard;
typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector;
typedef SIMDImpl::SIMD256Impl::Mask simd8mask;
// simd16* = 512-bit (16 x 32-bit lanes)
typedef SIMDImpl::SIMD512Impl::Float simd16scalar;
typedef SIMDImpl::SIMD512Impl::Double simd16scalard;
typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector;
typedef SIMDImpl::SIMD512Impl::Mask simd16mask;
// Width-generic aliases selected by the build-time SIMD width knob.
#if KNOB_SIMD_WIDTH == 8
// NOTE(review): this is a unified-diff rendering without +/- markers; the raw
// __m256/__m256i/uint8_t typedefs below appear to be the pre-change
// definitions this commit removes, replaced by the simd8* aliases that
// follow — confirm against the actual patch.
typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
typedef simd8scalar simdscalar;
typedef simd8scalard simdscalard;
typedef simd8scalari simdscalari;
typedef simd8vector simdvector;
typedef simd8mask simdmask;
#else
#error Unsupported vector width
#endif
// simd vector
// SIMD-width-aligned 4-component vector; each component (x, y, z, w) is a
// full SIMD register. Components are accessible by name or by index through
// operator[], which aliases the same storage via the v[] array.
OSALIGNSIMD(union) simdvector
{
simdscalar v[4];
struct
{
simdscalar x, y, z, w;
};
simdscalar& operator[] (const int i) { return v[i]; }
const simdscalar& operator[] (const int i) const { return v[i]; }
};
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16
#if ENABLE_AVX512_EMULATION
// 16-wide types emulated as a pair of 8-wide AVX registers (lo/hi halves)
// for targets without native AVX512 support.
struct simd16scalar
{
__m256 lo;
__m256 hi;
};
struct simd16scalard
{
__m256d lo;
__m256d hi;
};
struct simd16scalari
{
__m256i lo;
__m256i hi;
};
// One bit per lane; split into two 8-bit halves by the macros below.
typedef uint16_t simd16mask;
#else
// Native AVX512: one 512-bit register (and hardware mask) per 16-wide type.
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16
// Extract / recombine the 8-wide halves of a 16-wide lane mask.
#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
#if defined(_WIN32)
// MSVC: use __vectorcall so wide vector arguments can be passed in registers.
#define SIMDAPI __vectorcall
#else
#define SIMDAPI
#endif
// 16-wide 4-component vector; same layout idea as simdvector above, aligned
// to the 16-wide register size.
OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
simd16scalar v[4];
struct
{
simd16scalar x, y, z, w;
};
simd16scalar& operator[] (const int i) { return v[i]; }
const simd16scalar& operator[] (const int i) const { return v[i]; }
};
#endif // ENABLE_AVX512_SIMD16
INLINE
UINT pdep_u32(UINT a, UINT mask)
{

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,550 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#include "simdlib_types.hpp"
// For documentation, please see the following include...
// #include "simdlib_interface.hpp"
// Per-width ISA implementation hierarchy. Each width (128/256/512) builds a
// chain of structs (AVX -> AVX2 -> AVX512) whose bodies are stamped out by
// textually including the matching .inl file under a guard macro; later ISAs
// inherit from earlier ones and override only what improves. A Traits struct
// then selects the implementation matching the compile-time SIMD_ARCH.
namespace SIMDImpl
{
namespace SIMD128Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Selects the 128-bit implementation and types for the compiled SIMD_ARCH.
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD128Impl::Float;
using Double = SIMD128Impl::Double;
using Integer = SIMD128Impl::Integer;
using Vec4 = SIMD128Impl::Vec4;
using Mask = SIMD128Impl::Mask;
};
} // ns SIMD128Impl
namespace SIMD256Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
struct AVXImpl
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
struct AVX2Impl : AVXImpl
{
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl : AVX2Impl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Selects the 256-bit implementation and types for the compiled SIMD_ARCH.
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD256Impl::Float;
using Double = SIMD256Impl::Double;
using Integer = SIMD256Impl::Integer;
using Vec4 = SIMD256Impl::Vec4;
using Mask = SIMD256Impl::Mask;
};
} // ns SIMD256Impl
namespace SIMD512Impl
{
#if SIMD_ARCH >= SIMD_ARCH_AVX
// Pre-AVX512 targets emulate 512-bit ops as a pair of 256-bit ops; the
// 256-bit implementation to pair is supplied as a template parameter.
template<typename SIMD256T>
struct AVXImplBase
{
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
}; // struct AVXImplBase
using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
#if SIMD_ARCH >= SIMD_ARCH_AVX2
using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
#if SIMD_ARCH >= SIMD_ARCH_AVX512
// Native 512-bit implementation; unlike 128/256 it does not inherit from
// the emulated base.
struct AVX512Impl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
// Selects the 512-bit implementation and types for the compiled SIMD_ARCH.
struct Traits : SIMDImpl::Traits
{
#if SIMD_ARCH == SIMD_ARCH_AVX
using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif
using Float = SIMD512Impl::Float;
using Double = SIMD512Impl::Double;
using Integer = SIMD512Impl::Integer;
using Vec4 = SIMD512Impl::Vec4;
using Mask = SIMD512Impl::Mask;
};
} // ns SIMD512Impl
} // ns SIMDImpl
//////////////////////////////////////////////////////////////////////////
/// SIMDBase - width-generic front end over a per-ISA implementation.
///
/// Inherits all primitive ops (add_ps, mul_ps, load1_ps, ...) from
/// Traits::IsaImpl and layers Vec4 / matrix helpers on top of them.
/// Instantiated as SIMD128 / SIMD256 / SIMD512 below.
//////////////////////////////////////////////////////////////////////////
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
    using CompareType = typename Traits::CompareType;
    using ScaleFactor = typename Traits::ScaleFactor;
    using RoundMode = typename Traits::RoundMode;
    using SIMD = typename Traits::IsaImpl;
    using Float = typename Traits::Float;
    using Double = typename Traits::Double;
    using Integer = typename Traits::Integer;
    using Vec4 = typename Traits::Vec4;
    using Mask = typename Traits::Mask;

private:
    // Dot product of one matrix row's first three columns with (v.x, v.y, v.z):
    //     (row[0] * v.x) + (row[1] * v.y) + (row[2] * v.z)
    // Rows are stored with a stride of 4 floats; callers pass pMatrix + row*4.
    // The multiply/add ordering matches the previous unrolled code exactly,
    // so float results are bit-identical.
    static SIMDINLINE Float SIMDCALL row_dot3(const float *pRow, const Vec4& v)
    {
        Float r = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        r = SIMD::add_ps(r, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        r = SIMD::add_ps(r, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        return r;
    }

public:
    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    static SIMDINLINE
    void vec4_load1_ps(Vec4& r, const float *p)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::set1_ps(p[i]);
        }
    }

    // Broadcasts the SIMD scalar register s into all four components of r.
    static SIMDINLINE
    void vec4_set1_vps(Vec4& r, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = s;
        }
    }

    // 3-component dot product: (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    static SIMDINLINE
    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
        tmp = SIMD::mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
        r = SIMD::add_ps(r, tmp);           // + (v0.y*v1.y)
        tmp = SIMD::mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
        r = SIMD::add_ps(r, tmp);           // + (v0.z*v1.z)
        return r;
    }

    // 4-component dot product:
    // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
    static SIMDINLINE
    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
        tmp = SIMD::mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
        r = SIMD::add_ps(r, tmp);           // + (v0.y*v1.y)
        tmp = SIMD::mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
        r = SIMD::add_ps(r, tmp);           // + (v0.z*v1.z)
        tmp = SIMD::mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
        r = SIMD::add_ps(r, tmp);           // + (v0.w*v1.w)  (comment fixed: previously omitted the w term)
        return r;
    }

    // Returns 1 / length(v), computed with the fast approximate
    // reciprocal-square-root of the ISA implementation.
    static SIMDINLINE
    Float vec4_rcp_length_ps(const Vec4& v)
    {
        Float length = vec4_dp4_ps(v, v);
        return SIMD::rsqrt_ps(length);
    }

    // r = v / length(v) (approximate, via rsqrt).
    static SIMDINLINE
    void vec4_normalize_ps(Vec4& r, const Vec4& v)
    {
        Float rcpLength = vec4_rcp_length_ps(v);
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::mul_ps(v[i], rcpLength);
        }
    }

    // r = v * s (component-wise scale by a SIMD scalar).
    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::mul_ps(v[i], s);
        }
    }

    // r = v0 * v1 (component-wise).
    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::mul_ps(v0[i], v1[i]);
        }
    }

    // r = v0 + s (component-wise add of a SIMD scalar).
    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::add_ps(v0[i], s);
        }
    }

    // r = v0 + v1 (component-wise).
    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::add_ps(v0[i], v1[i]);
        }
    }

    // r = min(v0, s) (component-wise).
    static SIMDINLINE
    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::min_ps(v0[i], s);
        }
    }

    // r = max(v0, s) (component-wise).
    static SIMDINLINE
    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
    {
        for (int i = 0; i < 4; ++i)
        {
            r[i] = SIMD::max_ps(v0[i], s);
        }
    }

    // Matrix4x4 * Vector4
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    // pMatrix is row-major, 4 floats per row.
    static SIMDINLINE
    void SIMDCALL mat4x4_vec4_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 4; ++row)
        {
            const float *pRow = pMatrix + row * 4;
            Float r = row_dot3(pRow, v);                            // (m0*v.x) + (m1*v.y) + (m2*v.z)
            Float m3 = SIMD::load1_ps(pRow + 3);                    // m[row][3]
            result[row] = SIMD::add_ps(r, SIMD::mul_ps(m3, v[3]));  // + (m3*v.w)
        }
    }

    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    //   outVec.xyz = 3x3 upper-left of the matrix times v.xyz; outVec.w = 0.
    //   The translation column (m*3) is ignored because w = 0.
    static SIMDINLINE
    void SIMDCALL mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 3; ++row)
        {
            result[row] = row_dot3(pMatrix + row * 4, v);
        }
        result[3] = SIMD::setzero_ps();
    }

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + m03
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + m13
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + m23
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + m33
    static SIMDINLINE
    void SIMDCALL mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 4; ++row)
        {
            const float *pRow = pMatrix + row * 4;
            Float r = row_dot3(pRow, v);                        // (m0*v.x) + (m1*v.y) + (m2*v.z)
            result[row] = SIMD::add_ps(r, SIMD::load1_ps(pRow + 3)); // + m3 (w == 1)
        }
    }

    // Matrix4x3 * Vector3 - Position vector where w = 1; the matrix has no
    // fourth row, so outVec.w is forced to 1.
    static SIMDINLINE
    void SIMDCALL mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        for (int row = 0; row < 3; ++row)
        {
            const float *pRow = pMatrix + row * 4;
            Float r = row_dot3(pRow, v);                        // (m0*v.x) + (m1*v.y) + (m2*v.z)
            result[row] = SIMD::add_ps(r, SIMD::load1_ps(pRow + 3)); // + m3 (w == 1)
        }
        result[3] = SIMD::set1_ps(1.0f);
    }
}; // struct SIMDBase
// Public SIMD front-end types, one per register width.
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;

View file

@ -0,0 +1,545 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (1) implementation
//
// This file is textually included inside a struct body (see simdlib.hpp), so
// everything here becomes a static member function. The SIMD_*WRAPPER_*
// macros stamp out thin forwarders to the matching _mm_* intrinsic:
//   WRAPPER  = Float operands, IWRAPPER = Integer, DWRAPPER = Double;
//   trailing digit = operand count; "I" suffix = extra immediate template
//   parameter; trailing "_" = intrinsic name given explicitly.
//============================================================================
// Float (op)(Float)
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return _mm_##op(a);\
}
// Float (op)(Float, Float)
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm_##op(a, b);\
}
// Double (op)(Double, Double)
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm_##op(a, b);\
}
// Float (op)<ImmT>(Float, Float) - compile-time immediate operand
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm_##op(a, b, ImmT);\
}
// Double (op)<ImmT>(Double, Double)
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm_##op(a, b, ImmT);\
}
// Float (op)(Float, Float, Float)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm_##op(a, b, c);\
}
// Integer (op)(Integer)
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm_##op(a);\
}
// Integer (op)<ImmT>(Integer) - forwards to an explicitly named intrinsic
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
// Integer (op)(Integer, Integer) - forwards to an explicitly named intrinsic
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return intrin(a, b);\
}
// Integer (op)(Integer, Integer)
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm_##op(a, b);\
}
// Integer (op)(Integer, Integer) implemented with a Float intrinsic,
// bridged through bit-pattern casts.
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
}
// Integer (op)<ImmT>(Integer, Integer)
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm_##op(a, b, ImmT);\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// AVX(1) has no FMA instructions, so fused multiply-add/subtract are
// emulated with separate mul and add (one extra rounding step compared to a
// true fused operation). AVX2+ implementations override these.
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
// Round toward the mode selected by the compile-time RoundMode template
// argument (see simdlib_types.hpp for the enumerators).
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // widening multiply of the even (0, 2) int32 lanes -> two int64 results (see _mm_mul_epi32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations (bitwise; the _ps forms operate on float registers
// but treat the bits as opaque)
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//
// Immediate-count shifts map directly to intrinsics; per-lane variable
// counts (sllv/srlv) have no AVX(1) instruction and are emulated below.
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
// return a << b (uint32) -- per-lane variable left shift.
//
// AVX(1) has no vpsllvd, so emulate it lane by lane through memory.
// Shifts are performed on unsigned values (signed left shift of a negative
// value is undefined in C++), and counts greater than 31 zero the lane,
// matching the AVX2 _mm_sllv_epi32 semantics instead of invoking undefined
// shift behavior.
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB)
{
    alignas(16) uint32_t a[4];
    alignas(16) uint32_t count[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(a), vA);
    _mm_store_si128(reinterpret_cast<__m128i*>(count), vB);
    for (unsigned i = 0; i < 4; ++i)
    {
        a[i] = (count[i] > 31) ? 0 : (a[i] << count[i]);
    }
    return _mm_load_si128(reinterpret_cast<const __m128i*>(a));
}
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
// Whole-register byte shift on a Float register: bridge through the integer
// domain with bit-pattern casts.
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
// return a >> b (uint32) -- per-lane variable logical right shift.
//
// AVX(1) has no vpsrlvd, so emulate it lane by lane through memory.
// Bug fix: the previous emulation shifted int32_t values, which performs an
// arithmetic (sign-extending) shift on negative lanes, whereas the contract
// (and _mm_srlv_epi32) is a logical uint32 shift. Counts greater than 31
// zero the lane, matching the AVX2 semantics instead of invoking undefined
// shift behavior.
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB)
{
    alignas(16) uint32_t a[4];
    alignas(16) uint32_t count[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(a), vA);
    _mm_store_si128(reinterpret_cast<__m128i*>(count), vB);
    for (unsigned i = 0; i < 4; ++i)
    {
        a[i] = (count[i] > 31) ? 0 : (a[i] >> count[i]);
    }
    return _mm_load_si128(reinterpret_cast<const __m128i*>(a));
}
//-----------------------------------------------------------------------
// Conversion operations
//
// cast* functions reinterpret the bit pattern (no conversion instruction);
// cvt* functions perform an actual numeric conversion.
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm_castps_si128(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm_castsi128_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm_castps_pd(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm_castsi128_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm_cvtepi32_ps(a);
}
// Zero-extending widening conversions (operate on the low lanes of a).
SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
// Float -> int32: cvtps uses the current rounding mode, cvttps truncates.
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//
// Float comparisons produce an all-ones / all-zeros mask per lane; the
// predicate is a compile-time CompareType (AVX _mm_cmp_ps predicate).
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
// Named shorthands using the ordered, non-signaling (OQ) predicates.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm_testz_si128(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
// Integer blend implemented via the float blend after bit-casting;
// selection uses the sign bit of each 32-bit mask lane.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
return _mm_broadcast_ss(p);
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
// Integer permute implemented with the float permutevar after bit-casting
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int)
{
return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm_permutevar_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
// Explicitly deleted for this target; callers must use an alternative form
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Scalar emulation of a gather (no gather instruction pre-AVX2): for each
// lane, scale its 32-bit index by ScaleT bytes and fetch the float at that
// byte offset from the base pointer p.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    Float gathered;
    float* dst = reinterpret_cast<float*>(&gathered);
    uint32_t const* indices = reinterpret_cast<uint32_t const*>(&idx);
    uint8_t const* base = reinterpret_cast<uint8_t const*>(p);
    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
    {
        uint32_t byteOffset = indices[lane] * static_cast<uint32_t>(ScaleT);
        dst[lane] = *reinterpret_cast<float const*>(base + byteOffset);
    }
    return gathered;
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
// Aligned load; p must meet the vector alignment requirement
return _mm_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return _mm_load_si128(&p->v);
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
// lddqu variant of the unaligned integer load
return _mm_lddqu_si128(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// Scalar emulation of a masked gather: only lanes whose mask sign bit is set
// are fetched; all other lanes keep their value from 'old'.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    Float gathered = old;
    float* dst = reinterpret_cast<float*>(&gathered);
    uint32_t const* indices = reinterpret_cast<uint32_t const*>(&idx);
    uint8_t const* base = reinterpret_cast<uint8_t const*>(p);
    uint32_t activeLanes = movemask_ps(mask);
    DWORD lane;
    // Visit each set bit (active lane) of the movemask, clearing as we go
    while (_BitScanForward(&lane, activeLanes))
    {
        activeLanes &= ~(1 << lane);
        uint32_t byteOffset = indices[lane] * static_cast<uint32_t>(ScaleT);
        dst[lane] = *reinterpret_cast<float const*>(base + byteOffset);
    }
    return gathered;
}
// Store src lanes whose corresponding mask lane has the sign bit set
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
_mm_maskstore_ps(p, mask, src);
}
// Pack the most-significant bit of each 8-bit lane into an integer bitmask
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return static_cast<uint32_t>(_mm_movemask_epi8(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
return static_cast<uint32_t>(_mm_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
return static_cast<uint32_t>(_mm_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm_setzero_si128();
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
// Aligned store; p must meet the vector alignment requirement
_mm_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm_store_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm_storeu_si128(&p->v, a);
}
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm_stream_ps(p, a);
}
static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
return _mm_set_ps(in3, in2, in1, in0);
}
// Extract lane ImmT as a float; the intrinsic returns the raw bits as int,
// so the bit pattern is punned back to float unchanged.
template <int ImmT>
static SIMDINLINE float SIMDCALL extract_ps(Float a)
{
int tmp = _mm_extract_ps(a, ImmT);
return *reinterpret_cast<float*>(&tmp);
}
// Clean up the wrapper helper macros so the next SIMD implementation
// header can redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I

View file

@ -0,0 +1,68 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD4 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are those that replace AVX (1) operations.
// Only 2 shifts and 2 gathers were introduced with AVX 2
// Also, add native support for FMA operations
//============================================================================
// 3-operand float wrapper (used for the native FMA forms below)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm_##op(a, b, c);\
}
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
// Native AVX2 per-lane variable shifts (replace the AVX (1) emulations)
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
{
return _mm_sllv_epi32(vA, vB);
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
{
return _mm_srlv_epi32(vA, vB);
}
// Native AVX2 hardware gather (replaces the AVX (1) scalar-loop emulation)
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
}
#undef SIMD_WRAPPER_3

View file

@ -0,0 +1,408 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are those that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
// Widen/narrow helpers: zero-extend a 128-bit register into a 512-bit one
// (and back) so the maskz forms of the 512-bit intrinsics can operate on
// just the low 4 lanes via the 0xf lane masks used throughout this file.
private:
static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps128_ps512(r.v); }
static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }
static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps128(r); }
static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
public:
// Float wrappers: apply the 512-bit masked-zero intrinsic to the widened
// operands, keeping only the lanes selected by 'mask' (low 4 x 32-bit).
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
// Double wrappers (2 x 64-bit lanes -> 0x3 masks). The unparameterized
// forms are unavailable under AVX512F_STRICT.
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
}
// Integer wrappers; the _8/_16/_32/_64 suffix is the element width, which
// fixes the lane-mask width (16/8/4/2 active lanes in 128 bits).
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
// Bug fix: rcp28_ps / rsqrt28_ps are AVX512ER instructions (Xeon Phi only)
// and fault with illegal-instruction on AVX512F parts; this file otherwise
// guards non-AVX512F ops with AVX512F_STRICT, so use the base-AVX512F
// rcp14_ps / rsqrt14_ps approximations instead. Their 2^-14 relative error
// is still tighter than the ~1.5*2^-12 of the _mm256_rcp_ps/_mm256_rsqrt_ps
// forms used by the AVX implementations of these same operations.
SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf)); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
// Byte-element ops; excluded under AVX512F_STRICT (presumably because they
// need more than base AVX512F -- confirm against the target ISA)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
#endif
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
#endif
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Bitwise ops are expressed with the epi32 intrinsics and a 4-lane mask
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
// return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// Pack/unpack operate on sub-dword elements, hence the AVX512F_STRICT guard
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
#endif
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
// return _mm256_permutevar8x32_ps(a, swiz);
//}
SIMD_IWRAPPER_1I_32(shuffle_epi32);
//template<int ImmT>
//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
#endif
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// All loads use the masked-zero 512-bit forms restricted to the low 4
// lanes, so only 16 bytes are touched in memory.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_load_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return __conv(_mm512_maskz_load_epi32(__mmask16(0xf), p));
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}
// Hardware gather of 4 floats, masked to the low 4 lanes
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(),
__mmask16(0xf),
__conv(idx),
p,
static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // Convert the float blend mask into a lane predicate by testing the
    // sign bit of each 32-bit lane, restricted to the low 4 lanes.
    // Bug fix: the test constant was 0x8000000 (bit 27, an exponent bit),
    // not the sign bit 0x80000000 the contract above requires and which
    // the matching maskstore_ps implementation already uses.
    __mmask16 m = 0xf;
    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
                                    _mm512_set1_epi32(0x80000000));
    // Inactive lanes keep their value from 'old'
    return __conv(_mm512_mask_i32gather_ps(
        __conv(old),
        m,
        __conv(idx),
        p,
        static_cast<int>(ScaleT)));
}
#if !defined(AVX512F_STRICT)
// Emulate SSE movemask: collect the top bit of each of the 16 bytes
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffull;
return static_cast<uint32_t>(
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#endif
// Store only lanes whose mask lane has the sign bit (bit 31) set
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
__mmask16 m = 0xf;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_store_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_store_ps(p, __mmask16(0xf), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm512_mask_store_epi32(p, __mmask16(0xf), __conv(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Clean up the wrapper helper macros so the next SIMD implementation
// header can redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -0,0 +1,757 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// 128-bit implementation used to emulate integer ops two halves at a time
// (AVX (1) has no 256-bit integer instructions)
using SIMD128T = SIMD128Impl::AVXImpl;
//============================================================================
// SIMD256 AVX (1) implementation
//============================================================================
// Direct wrappers around the corresponding _mm256_* intrinsic
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return _mm256_##op(a);\
}
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm256_##op(a, b, c);\
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm256_##op(a);\
}
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm256_##op(a, b);\
}
// Integer op implemented with a float intrinsic via bit-casts
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
}
#define SIMD_IFWRAPPER_2I(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm256_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
{\
return _mm256_##op(a, b, c);\
}
// emulated integer simd: each wrapper applies the 128-bit implementation
// independently to the two 128-bit halves (.v4[0] / .v4[1]) of the
// 256-bit operand(s).
#define SIMD_EMU_IWRAPPER_1(op) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a)\
{\
return Integer\
{\
SIMD128T::op(a.v4[0]),\
SIMD128T::op(a.v4[1]),\
};\
}
// Widening conversion: the upper result half is produced by first shifting
// the source right by 'shift' bytes so its input elements land in the low
// half of the 128-bit register.
#define SIMD_EMU_IWRAPPER_1L(op, shift) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a)\
{\
return Integer \
{\
SIMD128T::op(a.v4[0]), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
};\
}\
static SIMDINLINE \
Integer SIMDCALL op(SIMD128Impl::Integer a)\
{\
return Integer \
{\
SIMD128T::op(a), \
SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
};\
}
#define SIMD_EMU_IWRAPPER_1I(op) \
template <int ImmT> static SIMDINLINE \
Integer SIMDCALL op(Integer a)\
{\
return Integer\
{\
SIMD128T::template op<ImmT>(a.v4[0]),\
SIMD128T::template op<ImmT>(a.v4[1]),\
};\
}
#define SIMD_EMU_IWRAPPER_2(op) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a, Integer b)\
{\
return Integer\
{\
SIMD128T::op(a.v4[0], b.v4[0]),\
SIMD128T::op(a.v4[1], b.v4[1]),\
};\
}
// Emulated 2-source integer op with an immediate template parameter:
// applies the 128-bit implementation to each 128-bit half.
// Bug fix: the second operand previously read b.v[0] / b.v[1] (indexing the
// native 256-bit member) instead of the 128-bit halves b.v4[0] / b.v4[1]
// used for operand 'a'.
#define SIMD_EMU_IWRAPPER_2I(op) \
    template <int ImmT> static SIMDINLINE \
    Integer SIMDCALL op(Integer a, Integer b)\
    {\
        return Integer\
        {\
            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),\
            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),\
        };\
    }
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
    // AVX (1) has no FMA instructions; emulate with separate mul + add
    return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
    // Bug fix: SIMD_WRAPPER_3(fmsub_ps) expanded to _mm256_fmsub_ps, an
    // FMA3 instruction that faults on AVX (1)-only hardware. Emulate with
    // separate mul + sub, mirroring fmadd_ps above.
    return sub_ps(mul_ps(a, b), c);
}
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// Round with a compile-time rounding mode (suppressing exceptions)
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
// All emulated via two 128-bit ops (AVX (1) lacks 256-bit integer math)
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Float-typed bitwise ops are native; integer variants are emulated
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b (uint32)
{
    // AVX (1) has no vpsllvd (per-lane variable shift), so emulate in scalar
    // code. Lanes are processed as uint32_t: a left shift produces the same
    // bit pattern as for int32_t, but unsigned arithmetic avoids the
    // signed-overflow UB the previous int32_t version hit whenever a bit was
    // shifted into or past the sign bit.
    // NOTE(review): shift counts >= 32 remain undefined behavior here,
    // whereas AVX2 vpsllvd yields 0 -- confirm callers never pass such counts.
    uint32_t vals[8];
    uint32_t counts[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(vals), vA);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);
    for (uint32_t lane = 0; lane < 8; ++lane)
    {
        vals[lane] <<= counts[lane];
    }
    return _mm256_loadu_si256(reinterpret_cast<__m256i const*>(vals));
}
SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
// Reinterpret to integer, byte-shift right, reinterpret back to float.
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b (uint32)
{
    // AVX (1) has no vpsrlvd (per-lane variable shift), so emulate in scalar
    // code. Fix: process lanes as uint32_t so the shift is a logical
    // (zero-filling) right shift, as the uint32 contract requires. The
    // previous emulation shifted int32_t lanes, which is an arithmetic
    // (sign-extending) shift for negative values and does not match AVX2's
    // vpsrlvd.
    // NOTE(review): shift counts >= 32 remain undefined behavior here,
    // whereas AVX2 vpsrlvd yields 0 -- confirm callers never pass such counts.
    uint32_t vals[8];
    uint32_t counts[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(vals), vA);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);
    for (uint32_t lane = 0; lane < 8; ++lane)
    {
        vals[lane] >>= counts[lane];
    }
    return _mm256_loadu_si256(reinterpret_cast<__m256i const*>(vals));
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
// The cast* functions only reinterpret bits between vector element types;
// no conversion is performed and no instruction should be generated.
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm256_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm256_castps_si256(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm256_castsi256_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm256_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
return _mm256_castpd_si256(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm256_castsi256_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm256_cvtepi32_ps(a);
}
// Widening conversions; the second macro argument is presumably the source
// lane count consumed from the low half of the input (macro defined
// earlier in the file -- confirm there).
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16)
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
// Rounds according to the current MXCSR rounding mode.
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm256_cvtps_epi32(a);
}
// Truncating (round-toward-zero) conversion.
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm256_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Generic float compare; CmpTypeT selects the vcmpps predicate.
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
// All shorthands below use the _OQ (ordered, quiet) predicates: any
// comparison involving NaN produces false and no exception is signaled.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
// Integer compares return all-ones per lane on true, zero on false.
SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm256_testz_ps(a, b);
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm256_testz_si256(a, b);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
// Integer blend emulated through the float blend instruction (bitwise
// select, so reinterpreting lanes as float is safe).
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
// Selection is per-lane based on the mask's sign bit (blendv semantics).
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
return _mm256_broadcast_ss(p);
}
SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
    Integer result;

    // Scalar emulation: AVX (1) has no cross-lane 32-bit permute.
    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // Mask the index with 0x7, not 0xF: this vector has 8 lanes, so the
        // old 0xF mask permitted out-of-bounds reads (pA[8..15]) for
        // malformed swiz values. Low-3-bit masking also matches the AVX2
        // hardware behavior of vpermd/_mm256_permutevar8x32_epi32, which
        // honors only the low 3 bits of each index.
        pResult[i] = pA[0x7 & pSwiz[i]];
    }
    return result;
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    Float result;

    // Scalar emulation: AVX (1) has no cross-lane 32-bit permute.
    float const *pA = reinterpret_cast<float const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    float *pResult = reinterpret_cast<float *>(&result);
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        // Mask the index with 0x7, not 0xF: this vector has 8 lanes, so the
        // old 0xF mask permitted out-of-bounds reads (pA[8..15]) for
        // malformed swiz values. Low-3-bit masking also matches the AVX2
        // hardware behavior of _mm256_permutevar8x32_ps, which honors only
        // the low 3 bits of each index.
        pResult[i] = pA[0x7 & pSwiz[i]];
    }
    return result;
}
SIMD_WRAPPER_2I(permute2f128_ps); // select 128-bit halves of a/b per ImmT
SIMD_DWRAPPER_2I(permute2f128_pd);
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
// 64-bit integer shuffle emulated via the double-precision shuffle
// (bitwise lane move, so the reinterpret casts are safe).
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
// Unpack (interleave) ops: 32-bit integer forms reuse the float
// instructions; other widths are emulated on AVX (1).
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Scalar gather emulation: AVX (1) has no gather instruction. idx holds
// per-lane byte offsets which are scaled by ScaleT before dereferencing.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
// Aligned load: p must be 32-byte aligned per _mm256_load_ps.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return _mm256_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return _mm256_load_si256(&p->v);
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm256_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm256_lddqu_si256(&p->v);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// Scalar emulation of the masked gather: only lanes whose mask sign bit is
// set are fetched; all other lanes keep their value from 'old'.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult = old;
float* pResult = (float*)&vResult;
DWORD index;
// movemask_ps packs the per-lane sign bits into a scalar bitmask; iterate
// its set bits with _BitScanForward (MSVC-style intrinsic; presumably
// shimmed for other compilers in os.h -- confirm there).
uint32_t umask = movemask_ps(mask);
while (_BitScanForward(&index, umask))
{
umask &= ~(1 << index);
uint32_t offset = pOffsets[index];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[index] = *(float const *)(((uint8_t const *)p + offset));
}
return vResult;
}
// Store only the lanes whose mask element has its sign bit set.
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
_mm256_maskstore_ps(p, mask, src);
}
// No 256-bit integer movemask on AVX (1): combine the two 128-bit halves.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return SIMD128T::movemask_epi8(a.v4[0]) |
(SIMD128T::movemask_epi8(a.v4[1]) << 16);
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
return static_cast<uint32_t>(_mm256_movemask_ps(a));
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm256_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm256_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm256_set1_ps(f);
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm256_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm256_setzero_si256();
}
// Aligned store: p must be 32-byte aligned per _mm256_store_ps.
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm256_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm256_store_si256(&p->v, a);
}
// Non-temporal (streaming) store; bypasses the cache hierarchy.
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm256_stream_ps(p, a);
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Broadcast a 128-bit vector into both halves of a 256-bit vector.
static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
{
return _mm256_broadcast_ps(&p->v);
}
// extractf128_*: extract the low (ImmT=0) or high (ImmT=1) 128-bit half.
template<int ImmT>
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
{
return _mm256_extractf128_pd(a, ImmT);
}
template<int ImmT>
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float a)
{
return _mm256_extractf128_ps(a, ImmT);
}
template<int ImmT>
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
{
return _mm256_extractf128_si256(a, ImmT);
}
// insertf128_*: replace the low (ImmT=0) or high (ImmT=1) 128-bit half.
template<int ImmT>
static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
{
return _mm256_insertf128_pd(a, b, ImmT);
}
template<int ImmT>
static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
{
return _mm256_insertf128_ps(a, b, ImmT);
}
template<int ImmT>
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
{
return _mm256_insertf128_si256(a, b, ImmT);
}
// Fallback definitions for toolchains whose headers do not provide these
// composite intrinsics (guarded so native definitions win when present).
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif
#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
/* SIMD128Impl::Integer const* */ loaddr) \
_mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif
// Load two independent (possibly unaligned) 128-bit halves into one vector.
static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
{
return _mm256_loadu2_m128i(&phi->v, &plo->v);
}
// Element order: i0 is the lowest lane, i7 the highest.
static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}
// Store the two 128-bit halves of src to two independent addresses.
static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
{
_mm256_storeu2_m128i(&phi->v, &plo->v, src);
}
// Tear down the local wrapper macros so the next implementation file can
// redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_3
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I

View file

@ -0,0 +1,234 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below ones that replace AVX (1) operations.
// Mostly these are integer operations that are no longer emulated with SSE
//============================================================================
// Wrapper macros mapping SIMDLib integer ops onto native AVX2 intrinsics.
// The *_ suffix variants take an explicit intrinsic name; *I variants take
// a compile-time immediate.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a);\
    }
// NOTE: no line-continuation backslash after the closing brace here -- the
// stray trailing '\' previously spliced the next #define line into this
// macro's replacement list.
#define SIMD_IWRAPPER_1L(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(_mm256_castsi256_si128(a));\
    }
#define SIMD_IWRAPPER_1I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b);\
    }
// Defined once; the file previously contained a second, identical
// definition of SIMD_IWRAPPER_2I, which has been removed.
#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b, ImmT);\
    }
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
// Native fused multiply-add (single rounding); overrides the AVX (1)
// mul+add emulation.
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
return _mm256_fmadd_ps(a, b, c);
}
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
// These use the native 256-bit AVX2 integer instructions, replacing the
// 128-bit-half emulation in the AVX (1) implementation.
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
// Reinterpret to integer, byte-shift right, reinterpret back to float.
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
// Widening conversions consume the low 128 bits of the input (see the
// SIMD_IWRAPPER_1L macro, which inserts the castsi256_si128).
SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
// AVX2 has no less-than compare instruction; swap operands of cmpgt.
static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
{
return cmpgt_epi32(b, a);
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Native cross-lane permutes (vpermd/vpermps) replace the AVX (1) scalar
// emulation; hardware honors only the low 3 bits of each index.
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm256_permutevar8x32_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
// 64-bit integer shuffle emulated via the double-precision shuffle
// (bitwise lane move, so the reinterpret casts are safe).
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Native hardware gather replaces the AVX (1) scalar emulation.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
// Native 256-bit integer movemask replaces the two-half AVX (1) version.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
return static_cast<uint32_t>(_mm256_movemask_epi8(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Tear down the local wrapper macros so the next implementation file can
// redefine them. (A duplicate "#undef SIMD_IWRAPPER_2I" has been removed.)
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1L
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I

View file

@ -0,0 +1,409 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
private:
// Zero-cost cast helpers between the 256-bit SIMD wrapper types and the
// native 512-bit register types. The AVX512 implementation below runs each
// 256-bit operation on a 512-bit register under a lane mask, so operands
// round-trip through these casts (the upper 256 bits are don't-care).
static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps256_ps512(r.v); }
static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); }
static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }
static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps256(r); }
static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd256(r); }
static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }
public:
// Wrapper macros executing each op as a masked-zero 512-bit instruction on
// widened operands. The mask selects the live lanes of the 256-bit value:
//   __mmask16(0xff)         = 8 of 16 float/int32 lanes
//   __mmask8(0xf)           = 4 of 8 double/int64 lanes
//   __mmask32(0xffff)       = 16 of 32 int16 lanes
//   __mmask64(0xffffffffull)= 32 of 64 int8 lanes
// Wrappers needing the 8/16/64-bit-lane mask types are gated on
// !AVX512F_STRICT -- presumably because those forms require more than the
// AVX512F foundation ISA (e.g. AVX512BW/DQ) -- confirm against target
// configuration.
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
}
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
//SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xff)); // return 1.0f / a
//SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
#endif
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
#endif
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Bitwise ops are expressed through the masked 32-bit-element forms so a
// zero-masking execution mask can restrict them to the low 256 bits.
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
// use AVX2 version
//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions
//-----------------------------------------------------------------------
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
//    return cmpgt_epi32(b, a);
//}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
#endif
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}
SIMD_IWRAPPER_1I_32(shuffle_epi32);
//template<int ImmT>
//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
#endif
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Masked loads read only the low 8 lanes; 'load' variants require aligned
// memory, 'loadu' variants do not.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return __conv(_mm512_maskz_load_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return __conv(_mm512_maskz_load_epi32(__mmask16(0xff), p));
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}
// Gather 8 floats; idx holds per-lane byte offsets scaled by ScaleT.
// Lanes outside the execution mask (upper 8) are zeroed via the merge source.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return __conv(_mm512_mask_i32gather_ps(
_mm512_setzero_ps(),
__mmask16(0xff),
__conv(idx),
p,
static_cast<int>(ScaleT)));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // Start from the 8-lane execution mask (0xff) so the upper half of the
    // 512-bit register is never gathered, then keep only lanes whose mask
    // value has the sign bit set; all other lanes retain 'old'.
    __mmask16 m = 0xff;
    // Bug fix: the sign-bit constant was 0x8000000 (bit 27); it must be
    // 0x80000000 (bit 31) to match the documented (mask & (1 << 31))
    // semantics and the constant used by maskstore_ps below.
    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
                                    _mm512_set1_epi32(0x80000000));
    return __conv(_mm512_mask_i32gather_ps(
        __conv(old),
        m,
        __conv(idx),
        p,
        static_cast<int>(ScaleT)));
}
#if !defined(AVX512F_STRICT)
// Returns a 32-bit mask with one bit per byte lane, set when the byte's
// sign bit (0x80) is set; only the low 32 bytes are tested (mask ULL).
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffffffull;
return static_cast<uint32_t>(
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#endif
// Store lanes whose 32-bit mask element has the sign bit (0x80000000) set.
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
__mmask16 m = 0xff;
m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
_mm512_mask_store_ps(p, m, __conv(src));
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_mask_store_ps(p, __mmask16(0xff), __conv(a));
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm512_mask_store_epi32(p, __mmask16(0xff), __conv(a));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
// Tear down all generator macros so the next included implementation file
// can redefine them without redefinition warnings.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View file

@ -0,0 +1,682 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation
//
// TODO: Optimize for KNL / KNH or for SKX??
// For now probably optimizing more for KNL as that's where
// immediate customers are.
//============================================================================
// Native lane count for this target: 16 x 32-bit lanes in a 512-bit register.
static const int TARGET_SIMD_WIDTH = 16;
// Half-width (256-bit) operations delegate to the AVX2 implementation.
using SIMD256T = SIMD256Impl::AVX2Impl;
// Wrapper-generator macros: each operation maps 1:1 onto a full-width
// _mm512_* intrinsic (no execution mask needed at native 512-bit width).
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return intrin(a);\
}
#define SIMD_WRAPPER_1(op) \
SIMD_WRAPPER_1_(op, _mm512_##op)
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
// Float op implemented via an integer intrinsic (bit-preserving casts).
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_castsi512_ps(_mm512_##intrin(\
_mm512_castps_si512(a), _mm512_castps_si512(b)));\
}
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##op(a, b);\
}
#define SIMD_WRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
#define SIMD_DWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm512_##op(a, b, c);\
}
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm512_##op(a);\
}
// Widening conversions: input is a 256-bit (_8) or 128-bit (_4) vector.
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
{\
return _mm512_##op(a);\
}
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
{\
return _mm512_##op(a);\
}
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return cmp(a, b);\
}
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
// Expand a hardware k-mask into a legacy all-ones / all-zeros vector mask;
// one overload per element width (64/32/16/8-bit lanes).
static SIMDINLINE Integer vmask(__mmask8 m)
{
return _mm512_maskz_set1_epi64(m, -1LL);
}
static SIMDINLINE Integer vmask(__mmask16 m)
{
return _mm512_maskz_set1_epi32(m, -1);
}
static SIMDINLINE Integer vmask(__mmask32 m)
{
return _mm512_maskz_set1_epi16(m, -1);
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
return _mm512_maskz_set1_epi8(m, -1);
}
public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
// NOTE(review): rcp28/rsqrt28 are AVX512ER instructions (Xeon Phi / KNL
// only, per the header comment above) -- confirm they are not reached on
// SKX-class AVX512F/BW/DQ targets.
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp28_ps); // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt28_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// Round using the immediate rounding mode carried by the RoundMode enum.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
// Strict AVX512F has no 512-bit float bitwise ops (they are AVX512DQ), so
// fall back to the integer forms with bit-preserving casts.
#if defined(AVX512F_STRICT)
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
#else
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
#endif
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
SIMD_IWRAPPER_2(srlv_epi32);
SIMD_IWRAPPER_2(srlv_epi32);
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return _mm512_castpd_ps(a);
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return _mm512_castps_si512(a);
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return _mm512_castsi512_pd(a);
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return _mm512_castps_pd(a);
}
static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
return _mm512_castpd_si512(a);
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return _mm512_castsi512_ps(a);
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return _mm512_cvtepi32_ps(a);
}
SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return _mm512_cvtps_epi32(a);
}
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm512_cvttps_epi32(a);
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
// Legacy vector mask generator
__mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
return castsi_ps(vmask(result));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
// Legacy vector mask generator
__mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
return vmask(result);
}
SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}
static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
// ImmT is interpreted as a per-lane bitmask selecting b over a.
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}
// NOTE(review): declared to return Float, but _mm512_mask_blend_epi32
// produces a __m512i; this relies on the wrapper-type conversion -- confirm
// the Float return type is intended rather than Integer.
template <int ImmT>
static SIMDINLINE Float blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}
// Per-lane select driven by the sign bit of mask (via movemask_ps).
static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
return _mm512_set1_ps(*p);
}
// 256-bit half extract/insert are routed through the f64x4/i64x4 forms,
// which exist in base AVX512F (the f32x8 forms require AVX512DQ).
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}
template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
return _mm512_extractf64x4_pd(a, imm);
}
template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
return _mm512_extracti64x4_epi64(a, imm);
}
template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}
template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
return _mm512_insertf64x4(a, b, imm);
}
template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
return _mm512_inserti64x4(a, b, imm);
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm512_permutexvar_epi32(swiz, a);
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm512_permutexvar_ps(swiz, a);
}
// Legacy 128-bit-lane permutes map to the AVX512 shuffle_*x4/x2 forms.
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
SIMD_IWRAPPER_1I(shuffle_epi32);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi16);
//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Scalar-emulated gather (no hardware gather used here).
// NOTE(review): reads lane offsets by reinterpreting &idx as uint32_t* --
// assumes Integer is addressable as 16 contiguous uint32 lanes; confirm.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult;
float* pResult = (float*)&vResult;
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
uint32_t offset = pOffsets[i];
offset = offset * static_cast<uint32_t>(ScaleT);
pResult[i] = *(float const*)(((uint8_t const*)p + offset));
}
return vResult;
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
return broadcast_ss(p);
}
// 'load' variants require 64-byte alignment; 'loadu' variants do not.
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return _mm512_load_ps(p);
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return _mm512_load_si512(&p->v);
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return _mm512_loadu_ps(p);
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return _mm512_loadu_si512(p);
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // Recover the lane predicate from the legacy full-width float mask.
    // NOTE(review): this tests "mask != 0.0f" rather than the sign bit the
    // comment above describes; legacy masks are all-ones/all-zeros so the
    // two agree in practice -- confirm no caller passes arbitrary floats.
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());
    // Consistency fix: pass the scale as an explicit int, matching
    // i32gather_ps above (required if ScaleFactor is a scoped enum).
    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}
// Store lanes whose 32-bit mask element is negative (sign bit set).
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
_mm512_mask_store_ps(p, m, src);
}
// One bit per byte lane: set when the int8 lane is negative (sign bit set).
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
return static_cast<uint64_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
__mmask8 m = _mm512_cmplt_pd_mask(a, setzero_pd());
return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
__mmask16 m = _mm512_cmplt_ps_mask(a, setzero_ps());
return static_cast<uint32_t>(m);
}
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return _mm512_set1_epi32(i);
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return _mm512_set1_epi8(i);
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return _mm512_set1_ps(f);
}
static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
return _mm512_setzero_pd();
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return _mm512_setzero_ps();
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return _mm512_setzero_si512();
}
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
_mm512_store_ps(p, a);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
_mm512_store_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
_mm512_storeu_si512(&p->v, a);
}
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm512_stream_ps(p, a);
}
static SIMDINLINE Integer SIMDCALL set_epi32(
int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return _mm512_set_epi32(
i15, i14, i13, i12, i11, i10, i9, i8,
i7, i6, i5, i4, i3, i2, i1, i0);
}
// 8-argument overload: upper eight lanes are zero-filled.
static SIMDINLINE Integer SIMDCALL set_epi32(
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(
float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return _mm512_set_ps(
i15, i14, i13, i12, i11, i10, i9, i8,
i7, i6, i5, i4, i3, i2, i1, i0);
}
// 8-argument overload: upper eight lanes are zero-filled.
static SIMDINLINE Float SIMDCALL set_ps(
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
// Expand a scalar bitmask into a legacy all-ones/all-zeros float vector mask.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}
// Clean up helper macros so they do not leak into files included later.
// The original list #undef'd SIMD_IWRAPPER_1, SIMD_IWRAPPER_2 and
// SIMD_IWRAPPER_2I twice each (copy/paste duplicates); each name is now
// listed exactly once. (#undef of an already-undefined name is harmless,
// so removing the duplicates is behavior-preserving.)
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
#undef SIMD_IFWRAPPER_2

View file

@ -0,0 +1,27 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions

View file

@ -0,0 +1,842 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX (1) implementation
//
// Emulates 512-bit (16-wide) SIMD by pairing two 256-bit (8-wide) operations
// on the underlying SIMD256T target; v8[0] holds the low 8 lanes, v8[1] the
// high 8 lanes.
//============================================================================
static const int TARGET_SIMD_WIDTH = 8; // lane count of each 256-bit half
using SIMD128T = SIMD128Impl::AVXImpl; // 128-bit helper used by some conversions
// Wrapper generators: each expands to a function that applies the SIMD256T
// operation of the same name to both 256-bit halves independently.
// Float unary op.
#define SIMD_WRAPPER_1(op) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return Float\
{\
SIMD256T::op(a.v8[0]),\
SIMD256T::op(a.v8[1]),\
};\
}
// Float binary op.
#define SIMD_WRAPPER_2(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return Float\
{\
SIMD256T::op(a.v8[0], b.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1]),\
};\
}
// Float binary op with a 16-bit per-lane immediate: low 8 bits go to the low
// half, the bits shifted down by TARGET_SIMD_WIDTH (8) go to the high half.
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return Float\
{\
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
};\
}
// Float binary op with one immediate applied identically to both halves.
#define SIMD_WRAPPER_2I_1(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return Float\
{\
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
};\
}
// Float ternary op.
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return Float\
{\
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
};\
}
// Integer unary op.
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return Integer\
{\
SIMD256T::op(a.v8[0]),\
SIMD256T::op(a.v8[1]),\
};\
}
// Integer binary op.
#define SIMD_IWRAPPER_2(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::op(a.v8[0], b.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1]),\
};\
}
// Integer binary op with a 16-bit per-lane immediate (split as SIMD_WRAPPER_2I).
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
};\
}
// Integer binary op with one immediate applied identically to both halves.
#define SIMD_IWRAPPER_2I_1(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
};\
}
// Integer binary op with an 8-bit immediate split into two 4-bit halves
// (used for 64-bit-lane shuffles, which take 4 control bits per half).
#define SIMD_IWRAPPER_2I_2(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return Integer\
{\
SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
};\
}
// Integer ternary op.
#define SIMD_IWRAPPER_3(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
{\
return Integer\
{\
SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
};\
}
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps); // return a * b
SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps); // return a - b
// Round each lane according to the RMT rounding-mode template parameter.
template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
return Float
{
SIMD256T::template round_ps<RMT>(a.v8[0]),
SIMD256T::template round_ps<RMT>(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations (bitwise; the _ps forms reinterpret floats as bits)
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_IWRAPPER_2(and_si); // return a & b (int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_IWRAPPER_2(or_si); // return a | b (int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations. Immediate-count shifts apply the same count to both
// 256-bit halves; variable shifts (sllv/srlv) take a per-lane count.
//-----------------------------------------------------------------------
template<int ImmT>
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
{
return Integer
{
SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
template<int ImmT>
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
{
return Integer
{
SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
};
}
template<int ImmT>
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
{
return Integer
{
SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
};
}
// Byte-wise shift, applied independently within each 128-bit lane.
template<int ImmT> // for each 128-bit lane:
static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
{
return Integer
{
SIMD256T::template srli_si<ImmT>(a.v8[0]),
SIMD256T::template srli_si<ImmT>(a.v8[1]),
};
}
template<int ImmT>
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
{
return Float
{
SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
};
}
SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Conversion operations. The cast* functions are bit-pattern
// reinterpretations (no value conversion); the cvt* functions convert
// values. Widening conversions take a 256-bit source because the result
// occupies the full 512 bits.
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
return Float
{
SIMD256T::castpd_ps(a.v8[0]),
SIMD256T::castpd_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
return Integer
{
SIMD256T::castps_si(a.v8[0]),
SIMD256T::castps_si(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
return Double
{
SIMD256T::castsi_pd(a.v8[0]),
SIMD256T::castsi_pd(a.v8[1]),
};
}
static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
return Double
{
SIMD256T::castps_pd(a.v8[0]),
SIMD256T::castps_pd(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
return Float
{
SIMD256T::castsi_ps(a.v8[0]),
SIMD256T::castsi_ps(a.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
return Float
{
SIMD256T::cvtepi32_ps(a.v8[0]),
SIMD256T::cvtepi32_ps(a.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
{
return Integer
{
SIMD256T::cvtepu8_epi16(a.v4[0]),
SIMD256T::cvtepu8_epi16(a.v4[1]),
};
}
// All 16 source bytes live in the low 128-bit lane (a.v4[0]); the upper half
// of the result comes from bytes 8..15, exposed via a byte-shift of 8.
static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
{
return Integer
{
SIMD256T::cvtepu8_epi32(a.v4[0]),
SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
{
return Integer
{
SIMD256T::cvtepu16_epi32(a.v4[0]),
SIMD256T::cvtepu16_epi32(a.v4[1]),
};
}
// As with cvtepu8_epi32: all 8 source uint16s live in a.v4[0], so the upper
// result half reads words 4..7 via a byte-shift of 8.
static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
{
return Integer
{
SIMD256T::cvtepu16_epi64(a.v4[0]),
SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
};
}
static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
{
return Integer
{
SIMD256T::cvtepu32_epi64(a.v4[0]),
SIMD256T::cvtepu32_epi64(a.v4[1]),
};
}
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
return Integer
{
SIMD256T::cvtps_epi32(a.v8[0]),
SIMD256T::cvtps_epi32(a.v8[1]),
};
}
// return (int32)a (rnd_to_zero(float) --> int32)
//
// Fix: forward to the truncating conversion on each 256-bit half. The
// previous code delegated to SIMD256T::cvtps_epi32, which rounds using the
// current rounding mode (round-to-nearest by default) instead of truncating
// toward zero, contradicting this function's contract.
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)
{
    return Integer
    {
        SIMD256T::cvttps_epi32(a.v8[0]),
        SIMD256T::cvttps_epi32(a.v8[1]),
    };
}
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
return Float
{
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
// Compress the per-lane compare result into an integer bitmask (one bit per lane).
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}
SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
// Nonzero only when BOTH halves report all-zero (bitwise AND of the two
// per-half testz results).
static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}
static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
SIMD256T::testz_si(a.v8[1], b.v8[1]));
}
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
// Variable blend; overloaded so the mask may be supplied as Float or Integer.
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
return Integer
{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
return Integer
{
SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    // Read the scalar once, replicate it into a 256-bit half, and use that
    // same half for both the low and high 8 lanes.
    float const value = *p;
    SIMD256Impl::Float const half = SIMD256T::set1_ps(value);
    return Float{ half, half };
}
// Extract / insert one 256-bit half. 'imm' selects the half: 0 = low 8 lanes,
// 1 = high 8 lanes; validated at runtime via SWR_ASSERT.
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
// Note: 'a' is received by value, so mutating a.v8[imm] and returning the
// copy leaves the caller's vector untouched.
template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
a.v8[imm] = b;
return a;
}
template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
a.v8[imm] = b;
return a;
}
template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
a.v8[imm] = b;
return a;
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Full-width lane permute, emulated with a scalar loop: the swizzle index is
// masked to 0xF since there are 16 (SIMD_WIDTH) source lanes.
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
Integer result;
// Ugly slow implementation
uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0xF & pSwiz[i]];
}
return result;
}
static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
Float result;
// Ugly slow implementation
float const *pA = reinterpret_cast<float const*>(&a);
uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
float *pResult = reinterpret_cast<float *>(&result);
for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
{
pResult[i] = pA[0xF & pSwiz[i]];
}
return result;
}
// All of the 512-bit permute2f128_XX intrinsics do the following:
//
// SELECT4(src, control) {
// CASE(control[1:0])
// 0: tmp[127:0] : = src[127:0]
// 1 : tmp[127:0] : = src[255:128]
// 2 : tmp[127:0] : = src[383:256]
// 3 : tmp[127:0] : = src[511:384]
// ESAC
// RETURN tmp[127:0]
// }
//
// dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
// dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
// dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
// dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
// dst[MAX:512] : = 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
// Each 2-bit AVX512 selector (0..3) addresses one of a's (or b's) four
// 128-bit lanes, so both of a's halves are passed to the 256-bit permute.
template <int shuf>
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
{
return Float
{
SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
{
return Double
{
SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
};
}
template <int shuf>
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
{
return Integer
{
SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
};
}
// Shuffle / unpack wrappers. Note the differing immediate treatment:
// _2I_1 passes ImmT unchanged to both halves; _2I_2 splits ImmT into two
// 4-bit halves (see the macro definitions above).
SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Gather: the low 8 indices address the low result half, the high 8 the
// high half; both halves gather from the same base pointer.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float
{
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
};
}
static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    // Replicate the single scalar at *p into every lane of both halves
    // (same effect as broadcast_ss).
    float const scalar = *p;
    return Float
    {
        SIMD256T::set1_ps(scalar),
        SIMD256T::set1_ps(scalar),
    };
}
// Full-width loads: the low half reads from p, the high half from
// p + TARGET_SIMD_WIDTH floats (i.e. the next 32 bytes).
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
return Float
{
SIMD256T::load_ps(p),
SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
};
}
static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
return Integer
{
SIMD256T::load_si(&p->v8[0]),
SIMD256T::load_si(&p->v8[1]),
};
}
static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
return Float
{
SIMD256T::loadu_ps(p),
SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
};
}
static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
return Integer
{
SIMD256T::loadu_si(&p->v8[0]),
SIMD256T::loadu_si(&p->v8[1]),
};
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
return Float
{
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
};
}
// Masked store: lanes whose mask bit is set are written; the high half
// targets p + TARGET_SIMD_WIDTH floats.
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}
// Movemask: combine the two per-half bitmasks. The high half's mask is
// shifted past the low half's bit count: 32 bits for epi8
// (TARGET_SIMD_WIDTH * 4 bytes per 256-bit half), 4 bits for pd
// (TARGET_SIMD_WIDTH / 2 doubles), 8 bits for ps.
static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
return mask;
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
return mask;
}
// Broadcast / zero: both 256-bit halves receive the same value.
static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
return Integer
{
SIMD256T::set1_epi32(i),
SIMD256T::set1_epi32(i)
};
}
static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
return Integer
{
SIMD256T::set1_epi8(i),
SIMD256T::set1_epi8(i)
};
}
static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
return Float
{
SIMD256T::set1_ps(f),
SIMD256T::set1_ps(f)
};
}
static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
return Float
{
SIMD256T::setzero_ps(),
SIMD256T::setzero_ps()
};
}
static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
return Integer
{
SIMD256T::setzero_si(),
SIMD256T::setzero_si()
};
}
// Full-width stores: low half to p, high half to p + TARGET_SIMD_WIDTH floats.
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
SIMD256T::store_ps(p, a.v8[0]);
SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
SIMD256T::store_si(&p->v8[0], a.v8[0]);
SIMD256T::store_si(&p->v8[1], a.v8[1]);
}
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
SIMD256T::stream_ps(p, a.v8[0]);
SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
// Element-wise initialization: i0 is the lowest lane, so i7..i0 fill the low
// 256-bit half (v8[0]) and i15..i8 fill the high half (v8[1]).
static SIMDINLINE Integer SIMDCALL set_epi32(
int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return Integer
{
SIMD256T::set_epi32(
i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_epi32(
i15, i14, i13, i12, i11, i10, i9, i8)
};
}
// 8-argument overload: fills the low 8 lanes, zeroes the high 8 lanes.
static SIMDINLINE Integer SIMDCALL set_epi32(
int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
return set_epi32(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL set_ps(
float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return Float
{
SIMD256T::set_ps(
i7, i6, i5, i4, i3, i2, i1, i0),
SIMD256T::set_ps(
i15, i14, i13, i12, i11, i10, i9, i8)
};
}
// 8-argument overload: fills the low 8 lanes, zeroes the high 8 lanes.
static SIMDINLINE Float SIMDCALL set_ps(
float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
return set_ps(
0, 0, 0, 0, 0, 0, 0, 0,
i7, i6, i5, i4, i3, i2, i1, i0);
}
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    // Expand an integer bitmask into a per-lane float mask: lane i becomes
    // all-ones when bit i of 'mask' is set, else zero. Each lane tests its
    // own bit (via AND with a per-lane power of two), then a signed
    // compare (0 < lane) turns any surviving bit into all-ones.
    const Integer laneBit = set_epi32(
        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    Integer replicated = set1_epi32(mask);
    replicated = and_si(replicated, laneBit);
    Integer selected = cmplt_epi32(setzero_si(), replicated);
    return castsi_ps(selected);
}
// Clean up helper macros so they do not leak into files included later.
// Fix: SIMD_IWRAPPER_2I_2 (defined above for the 4-bit split immediates)
// was missing from this list and leaked past this file.
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_2I_2
#undef SIMD_IWRAPPER_3

View file

@ -0,0 +1,28 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// no backwards compatibility for simd mask-enabled functions

View file

@ -0,0 +1,428 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
//=======================================================================
// SIMD Types
//
// These typedefs are examples. The SIMD256 and SIMD16 implementations will
// use different base types with this same naming.
using Float = __m256; // Packed single-precision float vector
using Double = __m256d; // Packed double-precision float vector
using Integer = __m256i; // Packed integer vector (mutable element widths)
using Mask = uint8_t; // Integer representing mask bits
//=======================================================================
// Standard interface
// (available in both SIMD256 and SIMD16 widths)
//=======================================================================
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
// (each operates independently on every 32-bit float lane)
//-----------------------------------------------------------------------
static Float add_ps(Float a, Float b); // return a + b
static Float div_ps(Float a, Float b); // return a / b
static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
static Float max_ps(Float a, Float b); // return (a > b) ? a : b
static Float min_ps(Float a, Float b); // return (a < b) ? a : b
static Float mul_ps(Float a, Float b); // return a * b
static Float rcp_ps(Float a); // return 1.0f / a (NOTE(review): likely the hardware's low-precision approximation -- confirm accuracy needs)
static Float rsqrt_ps(Float a); // return 1.0f / sqrt(a) (NOTE(review): likely the hardware's low-precision approximation -- confirm accuracy needs)
static Float sub_ps(Float a, Float b); // return a - b
// Rounding control for round_ps: the low two bits select a rounding
// direction, 0x04 defers to the MXCSR register, and 0x08 suppresses
// floating point exceptions.
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5) (NOTE: x86 hardware rounds ties to even)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
// return round_func(a)
//
// round_func is chosen on the RMT template parameter. See the documentation
// for the RoundMode enumeration above.
template <RoundMode RMT>
static Float round_ps(Float a); // return round(a)
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
static Integer abs_epi32(Integer a); // return absolute_value(a) (int32)
static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
static Integer add_epi8(Integer a, Integer b); // return a + b (int8)
static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8, saturating)
static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)
// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
// (Return type corrected from Float to Integer: this is an integer op.)
static Integer mullo_epi32(Integer a, Integer b);
static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8, saturating)
//-----------------------------------------------------------------------
// Logical operations
// (bitwise; _ps variants operate on float vectors reinterpreted as bits)
//-----------------------------------------------------------------------
static Float and_ps(Float a, Float b); // return a & b (float treated as int)
static Integer and_si(Integer a, Integer b); // return a & b (int)
static Float andnot_ps(Float a, Float b); // return (~a) & b (float treated as int)
static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
static Float or_ps(Float a, Float b); // return a | b (float treated as int)
static Integer or_si(Integer a, Integer b); // return a | b (int) (return type corrected from Float to Integer, matching and_si / xor_si)
static Float xor_ps(Float a, Float b); // return a ^ b (float treated as int)
static Integer xor_si(Integer a, Integer b); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
static Integer slli_epi32(Integer a); // return a << ImmT
static Integer sllv_epi32(Integer a, Integer b); // return a << b (per-lane variable shift)
template<int ImmT>
static Integer srai_epi32(Integer a); // return a >> ImmT (int32, arithmetic)
template<int ImmT>
static Integer srli_epi32(Integer a); // return a >> ImmT (uint32, logical)
template<int ImmT> // for each 128-bit lane:
static Integer srli_si(Integer a); // return a >> (ImmT*8) (uint)
template<int ImmT>
static Float srlisi_ps(Float a); // same as srli_si, but with Float cast to int
static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32, per-lane variable shift)
//-----------------------------------------------------------------------
// Conversion operations
//
// cast* functions reinterpret the bit pattern (no value conversion);
// cvt* functions perform an actual value conversion.  The widening
// cvtepu* conversions read from the low elements of a.
//-----------------------------------------------------------------------
static Float castpd_ps(Double a); // return *(Float*)(&a)
static Integer castps_si(Float a); // return *(Integer*)(&a)
static Double castsi_pd(Integer a); // return *(Double*)(&a)
static Double castps_pd(Float a); // return *(Double*)(&a)
static Float castsi_ps(Integer a); // return *(Float*)(&a)
static Float cvtepi32_ps(Integer a); // return (float)a (int32 --> float)
static Integer cvtepu8_epi16(Integer a); // return (int16)a (uint8 --> int16)
static Integer cvtepu8_epi32(Integer a); // return (int32)a (uint8 --> int32)
static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
static Integer cvtps_epi32(Float a); // return (int32)a (float --> int32, rounds per current rounding mode)
static Integer cvttps_epi32(Float a); // return (int32)a (rnd_to_zero(float) --> int32)
//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
// Comparison types used with cmp_ps:
// - ordered comparisons are always false if either operand is NaN
// - unordered comparisons are always true if either operand is NaN
// - signaling comparisons raise an exception if either operand is NaN
// - non-signaling comparisons will never raise an exception
//
// Ordered: return (a != NaN) && (b != NaN) && (a cmp b)
// Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
//
// Enumerator values match the AVX _CMP_* predicate immediates.
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
// return a (CmpTypeT) b (float)
//
// See documentation for CompareType above for valid values for CmpTypeT.
template<CompareType CmpTypeT>
static Float cmp_ps(Float a, Float b); // return a (CmpTypeT) b (see above)
static Float cmpgt_ps(Float a, Float b); // return cmp_ps<CompareType::GT_OQ>(a, b)
static Float cmple_ps(Float a, Float b); // return cmp_ps<CompareType::LE_OQ>(a, b)
static Float cmplt_ps(Float a, Float b); // return cmp_ps<CompareType::LT_OQ>(a, b)
static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b)
static Float cmpeq_ps(Float a, Float b); // return cmp_ps<CompareType::EQ_OQ>(a, b)
static Float cmpge_ps(Float a, Float b); // return cmp_ps<CompareType::GE_OQ>(a, b)
// Integer compares produce all-1s (true) / all-0s (false) per lane.
static Integer cmpeq_epi8(Integer a, Integer b); // return a == b (int8)
static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
static Integer cmpgt_epi8(Integer a, Integer b); // return a > b (int8)
static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
static bool testz_ps(Float a, Float b); // return all_lanes_zero(a & b) ? 1 : 0 (float)
static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int)
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template<int ImmT>
static Float blend_ps(Float a, Float b); // return ImmT ? b : a (float)
static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
static Float blendv_ps(Float a, Float b, Float mask); // return mask ? b : a (float)
static Float broadcast_ss(float const *p); // return *p (all elements in vector get same value)
static Integer packs_epi16(Integer a, Integer b); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
static Integer packs_epi32(Integer a, Integer b); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
static Float permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
static Float permute_ps(Float a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (float)
template<int SwizT>
static Integer shuffle_epi32(Integer a, Integer b);
template<int SwizT>
static Integer shuffle_epi64(Integer a, Integer b);
static Integer shuffle_epi8(Integer a, Integer b);
template<int SwizT>
static Float shuffle_pd(Double a, Double b);
template<int SwizT>
static Float shuffle_ps(Float a, Float b);
static Integer unpackhi_epi16(Integer a, Integer b);
static Integer unpackhi_epi32(Integer a, Integer b);
static Integer unpackhi_epi64(Integer a, Integer b);
static Integer unpackhi_epi8(Integer a, Integer b);
static Float unpackhi_pd(Double a, Double b);
static Float unpackhi_ps(Float a, Float b);
static Integer unpacklo_epi16(Integer a, Integer b);
static Integer unpacklo_epi32(Integer a, Integer b);
static Integer unpacklo_epi64(Integer a, Integer b);
static Integer unpacklo_epi8(Integer a, Integer b);
static Float unpacklo_pd(Double a, Double b);
static Float unpacklo_ps(Float a, Float b);
//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
// Byte multiplier applied to gather indices.  Explicit values keep this
// enum consistent with SIMDImpl::ScaleFactor (simdlib_types.hpp) and make
// the (idx * ScaleT) address math in the comments below read literally;
// the previous implicit values (0..3) contradicted both.
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
template<ScaleFactor ScaleT>
static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements)
static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory)
static Integer load_si(Integer const *p); // return *p
static Float loadu_ps(float const *p); // return *p (same as load_ps but allows for unaligned mem)
static Integer loadu_si(Integer const *p); // return *p (same as load_si but allows for unaligned mem)
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// (Template parameter corrected from int to ScaleFactor, matching i32gather_ps.)
template<ScaleFactor ScaleT>
static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
static void maskstore_ps(float *p, Integer mask, Float src);
static int movemask_epi8(Integer a);
static int movemask_pd(Double a);
static int movemask_ps(Float a);
static Integer set1_epi32(int i); // return i (all elements are same value)
static Integer set1_epi8(char i); // return i (all elements are same value)
static Float set1_ps(float f); // return f (all elements are same value)
static Float setzero_ps(); // return 0 (float)
static Integer setzero_si(); // return 0 (integer)
static void store_ps(float *p, Float a); // *p = a (stores all elements contiguously in memory)
static void store_si(Integer *p, Integer a); // *p = a
static void stream_ps(float *p, Float a); // *p = a (same as store_ps, but doesn't keep memory in cache)
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
static Float broadcast_ps(__m128 const *p); // broadcast 128-bit value to both halves
template<int ImmT>
static __m128d extractf128_pd(Double a); // return 128-bit half ImmT of a
template<int ImmT>
static __m128 extractf128_ps(Float a);
template<int ImmT>
static __m128i extractf128_si(Integer a);
template<int ImmT>
static Double insertf128_pd(Double a, __m128d b); // replace 128-bit half ImmT of a with b
template<int ImmT>
static Float insertf128_ps(Float a, __m128 b);
template<int ImmT>
static Integer insertf128_si(Integer a, __m128i b);
// Load the high/low 128-bit halves from two (possibly unaligned) pointers;
// mirrors storeu2_si below.  (Parameter type corrected from __m128 const*
// to __m128i const*: this is an integer load.)
static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
template<int ImmT>
static Double permute2f128_pd(Double a, Double b);
template<int ImmT>
static Float permute2f128_ps(Float a, Float b);
template<int ImmT>
static Integer permute2f128_si(Integer a, Integer b);
static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
static void storeu2_si(__m128i *phi, __m128i *plo, Integer src); // store halves of src to two (possibly unaligned) pointers
//=======================================================================
// Advanced masking interface (currently available only in SIMD16 width)
//=======================================================================
//=======================================================================
// Extended Utility Functions (common to SIMD256 and SIMD16)
//=======================================================================
//-----------------------------------------------------------------------
// Extended Types
//-----------------------------------------------------------------------
// Vec4, an SOA SIMD set of 4-dimensional vectors
// (one SIMD register per component; all union members alias the same
// storage, so v[], vi[] and s.{x,y,z,w} are different views of it)
union Vec4
{
Vec4() = default;
// Broadcast one SIMD vector into all four components.
Vec4(Float in)
{
s.x = in;
s.y = in;
s.z = in;
s.w = in;
}
// Construct from per-component SIMD vectors.
Vec4(Float x, Float y, Float z, Float w)
{
s.x = x;
s.y = y;
s.z = z;
s.w = w;
}
Float v[4]; // components as an array of float vectors
Integer vi[4]; // same storage viewed as integer vectors
struct
{
Float x;
Float y;
Float z;
Float w;
} s;
Float& operator[] (const int i) { return v[i]; }
Float const & operator[] (const int i) const { return v[i]; }
};
//-----------------------------------------------------------------------
// Extended Functions
// (SOA 4-component vector helpers built on the operations above)
//-----------------------------------------------------------------------
static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ...
static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1)
static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1)
static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v))
static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v)
static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s)
static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s
static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s
// Matrix4x4 * Vector4
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
static void mat4x4_vec4_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
// NOTE(review): name says mat3x3 but the math above reads a full 4x4
// matrix (including the m3x row) -- confirm the name is intentional.
static void mat3x3_vec3_w0_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x4 * Vector3 - Position vector where w = 1.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
// result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
static void mat4x4_vec3_w1_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
// Matrix4x3 * Vector3 - Position vector where w = 1.
// result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
// result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
// result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
// result.s.w = 1
static void mat4x3_vec3_w1_multiply(
Vec4& result,
const float *pMatrix,
const Vec4& v);
};
#endif // #if 0

View file

@ -0,0 +1,377 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if !defined(__cplusplus)
#error C++ compilation required
#endif
#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>
// Target ISA levels, in increasing order of capability.  SIMD_ARCH selects
// which instruction set the SIMD implementation may assume at compile time
// (defaults to AVX when the build doesn't specify one).
#define SIMD_ARCH_AVX 0
#define SIMD_ARCH_AVX2 1
#define SIMD_ARCH_AVX512 2
#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif
// Compiler abstractions: calling convention, forced inlining, and type
// alignment (MSVC vs. GCC/Clang spellings).
#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif
namespace SIMDImpl
{
// Floating point comparison predicates for cmp_ps.
// Enumerator values match the AVX _CMP_* immediate encodings.
// "Ordered" predicates are false when either operand is NaN; "unordered"
// predicates are true.  "Signaling" predicates raise an invalid-operation
// exception on quiet NaN inputs; "nonsignaling" ones do not.
enum class CompareType
{
EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
LT_OS = 0x01, // Less-than (ordered, signaling)
LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
UNORD_Q = 0x03, // Unordered (nonsignaling)
NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
NLT_US = 0x05, // Not-less-than (unordered, signaling)
NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
ORD_Q = 0x07, // Ordered (nonsignaling)
EQ_UQ = 0x08, // Equal (unordered, non-signaling)
NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
GT_OS = 0x0E, // Greater-than (ordered, signaling)
TRUE_UQ = 0x0F, // True (unordered, non-signaling)
EQ_OS = 0x10, // Equal (ordered, signaling)
LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
UNORD_S = 0x13, // Unordered (signaling)
NEQ_US = 0x14, // Not-equal (unordered, signaling)
NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
ORD_S = 0x17, // Ordered (signaling)
EQ_US = 0x18, // Equal (unordered, signaling)
NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
FALSE_OS = 0x1B, // False (ordered, signaling)
NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
TRUE_US = 0x1F, // True (unordered, signaling)
};
#if SIMD_ARCH >= SIMD_ARCH_AVX512
// Integer comparison predicates for the AVX512 compare-to-mask operations;
// values are the _MM_CMPINT_* immediates consumed by the _mm512_cmp_ep*
// intrinsics (only defined when AVX512 headers provide them).
enum class CompareTypeInt
{
EQ = _MM_CMPINT_EQ, // Equal
LT = _MM_CMPINT_LT, // Less than
LE = _MM_CMPINT_LE, // Less than or Equal
NE = _MM_CMPINT_NE, // Not Equal
GE = _MM_CMPINT_GE, // Greater than or Equal
GT = _MM_CMPINT_GT, // Greater than
};
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
// Byte multiplier applied to gather indices.  Enumerator values are the
// literal scale factors, so they can be used directly in address math.
enum class ScaleFactor
{
SF_1 = 1, // No scaling
SF_2 = 2, // Scale offset by 2
SF_4 = 4, // Scale offset by 4
SF_8 = 8, // Scale offset by 8
};
// Rounding control for round_ps.  The low two bits select the rounding
// direction, CUR_DIRECTION (0x04) defers to the MXCSR register, and
// NO_EXC (0x08) suppresses floating point exceptions.
enum class RoundMode
{
TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5) (NOTE: x86 hardware rounds ties to even)
TO_NEG_INF = 0x01, // Round to negative infinity
TO_POS_INF = 0x02, // Round to positive infinity
TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
RAISE_EXC = 0x00, // Raise exception on overflow
NO_EXC = 0x08, // Suppress exceptions
NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
};
// Bundles the shared enums so concrete SIMD implementations can
// re-export them uniformly from a single typedef.
struct Traits
{
using CompareType = SIMDImpl::CompareType;
using ScaleFactor = SIMDImpl::ScaleFactor;
using RoundMode = SIMDImpl::RoundMode;
};
// Attribute, 4-dimensional attribute in SIMD SOA layout
// (one SIMD register per component).  All union members alias the same
// storage: v/vi/vd are typed array views, the anonymous struct provides
// named component access.
template<typename Float, typename Integer, typename Double>
union Vec4
{
Float v[4];
Integer vi[4];
Double vd[4];
struct
{
Float x;
Float y;
Float z;
Float w;
};
SIMDINLINE Float& operator[] (const int i) { return v[i]; }
SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
// Member-wise copy through the Float view; since all members alias the
// same bytes, this copies the whole attribute regardless of which view
// the caller uses.
SIMDINLINE Vec4& operator=(Vec4 const & in)
{
v[0] = in.v[0];
v[1] = in.v[1];
v[2] = in.v[2];
v[3] = in.v[3];
return *this;
}
};
namespace SIMD128Impl
{
// Thin wrapper unions around the native 128-bit intrinsic types.  They
// add implicit conversion to/from the raw __m128/__m128i/__m128d types
// and force 16-byte alignment via SIMDALIGN.
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m128 in) : v(in) {}
SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
SIMDINLINE operator __m128() const { return v; }
SIMDALIGN(__m128, 16) v;
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m128i in) : v(in) {}
SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
SIMDINLINE operator __m128i() const { return v; }
SIMDALIGN(__m128i, 16) v;
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m128d in) : v(in) {}
SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
SIMDINLINE operator __m128d() const { return v; }
SIMDALIGN(__m128d, 16) v;
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t; // per-lane mask type
static const uint32_t SIMD_WIDTH = 4; // 4 x 32-bit lanes
} // ns SIMD128Impl
namespace SIMD256Impl
{
// Wrapper unions around the native 256-bit intrinsic types.  In addition
// to the implicit conversions, each can be constructed from two 128-bit
// halves (upper half defaults to zero) and exposes the halves via v4[2],
// which aliases the full-width member.
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m256 in) : v(in) {}
// Compose from low/high 128-bit halves; in_hi defaults to zero.
SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
{
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
}
SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
SIMDINLINE operator __m256() const { return v; }
SIMDALIGN(__m256, 32) v;
SIMD128Impl::Float v4[2]; // aliases v as two 128-bit halves
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m256i in) : v(in) {}
// Compose from low/high 128-bit halves; in_hi defaults to zero.
SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
{
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
}
SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
SIMDINLINE operator __m256i() const { return v; }
SIMDALIGN(__m256i, 32) v;
SIMD128Impl::Integer v4[2]; // aliases v as two 128-bit halves
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m256d in) : v(in) {}
// Compose from low/high 128-bit halves; in_hi defaults to zero.
SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
{
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
}
SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
SIMDINLINE operator __m256d() const { return v; }
SIMDALIGN(__m256d, 32) v;
SIMD128Impl::Double v4[2]; // aliases v as two 128-bit halves
};
using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
using Mask = uint8_t; // per-lane mask type
static const uint32_t SIMD_WIDTH = 8; // 8 x 32-bit lanes
} // ns SIMD256Impl
namespace SIMD512Impl
{
// NOTE(review): _MM_K0_REG is used as a proxy for "immintrin.h provided
// the AVX512 types"; this macro is compiler-specific -- confirm it holds
// on all supported toolchains.
#if !defined(_MM_K0_REG)
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to be viewed
// in a debugger. Do NOT access them via code!
union __m512
{
private:
float m512_f32[16];
};
struct __m512d
{
private:
double m512d_f64[8];
};
union __m512i
{
private:
int8_t m512i_i8[64];
int16_t m512i_i16[32];
int32_t m512i_i32[16];
int64_t m512i_i64[8];
uint8_t m512i_u8[64];
uint16_t m512i_u16[32];
uint32_t m512i_u32[16];
uint64_t m512i_u64[8];
};
using __mmask16 = uint16_t;
#endif
// Full 64-byte alignment is only required when real AVX512 codegen is
// enabled; emulated builds operate on the 256-bit halves and need 32.
#if SIMD_ARCH >= SIMD_ARCH_AVX512
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif
// 512-bit wrapper unions.  v8[2] aliases the full-width member as two
// 256-bit halves.  On non-AVX512 builds, copy assignment goes through
// the halves so the (possibly dummy, debug-only) __m512 member is never
// touched by code.
union Float
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m512 in) : v(in) {}
SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
SIMDINLINE Float& operator=(Float const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE operator __m512() const { return v; }
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Float v8[2]; // aliases v as two 256-bit halves
};
union Integer
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m512i in) : v(in) {}
SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
SIMDINLINE Integer& operator=(Integer const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE operator __m512i() const { return v; }
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Integer v8[2]; // aliases v as two 256-bit halves
};
union Double
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m512d in) : v(in) {}
SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
SIMDINLINE Double& operator=(Double const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#else
v8[0] = in.v8[0];
v8[1] = in.v8[1];
#endif
return *this;
}
SIMDINLINE operator __m512d() const { return v; }
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Double v8[2]; // aliases v as two 256-bit halves
};
typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
using Mask = __mmask16; // 16-bit lane mask
static const uint32_t SIMD_WIDTH = 16; // 16 x 32-bit lanes
#undef SIMD_ALIGNMENT_BYTES
} // ns SIMD512Impl
} // ns SIMDImpl

View file

@ -43,10 +43,10 @@ enum SWR_BACKEND_FUNCS
};
#if KNOB_SIMD_WIDTH == 8
static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#define MASK 0xff
#endif
@ -163,52 +163,52 @@ struct generateInputCoverage
uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
if(T::MultisampleT::numSamples == 1)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
}
else if(T::MultisampleT::numSamples == 2)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 4)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 8)
{
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
}
else if(T::MultisampleT::numSamples == 16)
{
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
}
}
else
{
__m256i src = _mm256_set1_epi32(0);
__m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
simdscalari src = _simd_set1_epi32(0);
simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
if(T::MultisampleT::numSamples == 1)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
}
else if(T::MultisampleT::numSamples == 2)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
}
else if(T::MultisampleT::numSamples == 4)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
}
else if(T::MultisampleT::numSamples == 8)
{
mask[0] = _mm256_set1_epi32(-1);
mask[0] = _simd_set1_epi32(-1);
}
else if(T::MultisampleT::numSamples == 16)
{
mask[0] = _mm256_set1_epi32(-1);
mask[1] = _mm256_set1_epi32(-1);
index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
mask[0] = _simd_set1_epi32(-1);
mask[1] = _simd_set1_epi32(-1);
index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
}
// gather coverage for samples 0-7
@ -253,14 +253,14 @@ struct generateInputCoverage
packedSampleCoverage = packedCoverage0;
}
#else
simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
@ -293,7 +293,7 @@ struct generateInputCoverage
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
}
};
@ -305,10 +305,10 @@ struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
simdscalari vec = _simd_set1_epi32(coverageMask[0]);
const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
inputCoverage = _simd_castsi_ps(vec);
}
@ -357,7 +357,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
(inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
// look up and set the sample offsets from UL pixel corner for first covered sample
__m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
samplePos.X(sampleNum[6]),
samplePos.X(sampleNum[5]),
samplePos.X(sampleNum[4]),
@ -366,7 +366,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
samplePos.X(sampleNum[1]),
samplePos.X(sampleNum[0]));
__m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
samplePos.Y(sampleNum[6]),
samplePos.Y(sampleNum[5]),
samplePos.Y(sampleNum[4]),
@ -380,7 +380,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
// Case (1) and case (3b) - All samples covered or not covered with full SampleMask
static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
static const simdscalari vZero = _simd_setzero_si();

View file

@ -88,7 +88,7 @@ INLINE void ProcessAttributes(
inputSlot = backendState.vertexAttribOffset + i;
}
__m128 attrib[3]; // triangle attribs (always 4 wide)
simd4scalar attrib[3]; // triangle attribs (always 4 wide)
float* pAttribStart = pBuffer;
if (HasConstantInterpT::value || IsDegenerate::value)
@ -128,7 +128,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[vid]);
SIMD128::store_ps(pBuffer, attrib[vid]);
pBuffer += 4;
}
}
@ -138,7 +138,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[i]);
SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@ -149,7 +149,7 @@ INLINE void ProcessAttributes(
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[i]);
SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@ -160,7 +160,7 @@ INLINE void ProcessAttributes(
// effect of the missing vertices in the triangle interpolation.
for (uint32_t v = NumVertsT::value; v < 3; ++v)
{
_mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
pBuffer += 4;
}
@ -279,8 +279,7 @@ struct GatherScissors_simd16<16>
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simd16scalari &scisXmin, simd16scalari &scisYmin,
simd16scalari &scisXmax, simd16scalari &scisYmax)
{
simd16scalari &scisXmax, simd16scalari &scisYmax) {
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask,
uint32_t clipAttribSlot = clipSlot == 0 ?
VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
__m128 primClipDist[3];
simd4scalar primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
{
OSALIGNSIMD(float) aVertClipDist[4];
_mm_store_ps(aVertClipDist, primClipDist[e]);
SIMD128::store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
};
@ -625,13 +624,14 @@ void BinTriangles(
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}
simdBBox bbox;
if (!triMask)
{
goto endBinTriangles;
}
// Calc bounding box of triangles
simdBBox bbox;
calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
@ -673,28 +673,30 @@ void BinTriangles(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
// Make triangle bbox inclusive
bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
// Make triangle bbox inclusive
bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
}
if (CT::IsConservativeT::value)
{
@ -768,7 +770,7 @@ endBinTriangles:
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@ -837,10 +839,10 @@ endBinTriangles:
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
_mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
@ -870,7 +872,7 @@ endBinTriangles:
#if USE_SIMD16_FRONTEND
template <typename CT>
void SIMDAPI BinTriangles_simd16(
void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
// Make triangle bbox inclusive
bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
// Make triangle bbox inclusive
bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
}
if (CT::IsConservativeT::value)
{
@ -1221,10 +1225,10 @@ endBinTriangles:
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@ -1547,24 +1551,26 @@ void BinPostSetupPoints(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
}
// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
}
// Cull bloated points completely outside scissor
simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16(
AR_END(FEBinPoints, 1);
}
void SIMDAPI BinPoints_simd16(
void SIMDCALL BinPoints_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -2168,6 +2176,8 @@ void BinPostSetupLines(
simdscalar& vRecipW0 = recipW[0];
simdscalar& vRecipW1 = recipW[1];
simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
// convert to fixed point
simdscalari vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical(prim[0].x);
@ -2214,24 +2224,26 @@ void BinPostSetupLines(
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
}
// Cull prims completely outside scissor
{
@ -2261,7 +2273,6 @@ void BinPostSetupLines(
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@ -2310,10 +2321,10 @@ void BinPostSetupLines(
// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
_mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16(
bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
}
// Cull prims completely outside scissor
{
@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16(
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
@ -2650,7 +2663,7 @@ void BinLines(
}
#if USE_SIMD16_FRONTEND
void SIMDAPI BinLines_simd16(
void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,

View file

@ -188,7 +188,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p
}
#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipTriangles, pDC->drawId);
@ -203,7 +203,7 @@ void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t work
AR_END(FEClipTriangles, 1);
}
void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipLines, pDC->drawId);
@ -218,7 +218,7 @@ void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId
AR_END(FEClipLines, 1);
}
void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipPoints, pDC->drawId);

View file

@ -1095,7 +1095,7 @@ public:
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
// we have to clip tris, execute the clipper, which will also
// call the binner
ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId);
ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
AR_END(FEGuardbandClip, 1);
}
else if (validMask)
@ -1180,7 +1180,7 @@ private:
{
simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simd16scalar vSrc = _simd16_setzero_ps();
return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
}
#endif
@ -1895,8 +1895,8 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
#endif

View file

@ -218,7 +218,7 @@ typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t worke
#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
uint32_t primMask, simd16scalari primID);
#endif

View file

@ -202,7 +202,7 @@ INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@ -247,7 +247,7 @@ INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
@ -293,7 +293,7 @@ INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
@ -309,7 +309,7 @@ INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT DstFormat>
INLINE void SIMDAPI StoreSOA(const simd16vector &src, uint8_t *pDst)
INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))

View file

@ -43,7 +43,7 @@ struct PackTraits
static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static simd16scalar unpack(simd16scalar &in) = delete;
static simd16scalar pack(simd16scalar &in) = delete;
#endif
@ -63,7 +63,7 @@ struct PackTraits<0, false>
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
#endif
@ -109,7 +109,7 @@ struct PackTraits<8, false>
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
return simdscalar{ _mm256_castsi256_ps(result) };
#else
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
@ -144,7 +144,7 @@ struct PackTraits<8, false>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@ -152,7 +152,8 @@ struct PackTraits<8, false>
static simd16scalar unpack(simd16scalar &in)
{
simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
simd16scalari result = _simd16_cvtepu8_epi32(tmp);
return _simd16_castsi_ps(result);
}
@ -259,7 +260,7 @@ struct PackTraits<8, true>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@ -267,7 +268,8 @@ struct PackTraits<8, true>
static simd16scalar unpack(simd16scalar &in)
{
simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
simd16scalari result = _simd16_cvtepu8_epi32(tmp);
return _simd16_castsi_ps(result);
}
@ -370,7 +372,7 @@ struct PackTraits<16, false>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
@ -469,7 +471,7 @@ struct PackTraits<16, true>
return result;
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
@ -514,7 +516,7 @@ struct PackTraits<32, false>
return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
}
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd16_store_ps(reinterpret_cast<float *>(pDst), src);
}
@ -812,7 +814,7 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src)
#if ENABLE_AVX512_SIMD16
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
{
static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
* powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
@ -834,7 +836,7 @@ inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
return result;
}
inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
{
// 5/12 is too small, so compute the 4th root of 20/12 instead.
// 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
@ -855,7 +857,7 @@ inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
return xavg;
}
inline static simd16scalar SIMDAPI powf_wrapper(const simd16scalar base, float exp)
inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
{
const float *f = reinterpret_cast<const float *>(&base);
@ -1410,7 +1412,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
}
INLINE static void SIMDAPI storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
{
switch (comp)
{

View file

@ -31,58 +31,58 @@
#include "common/simdintrin.h"
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
{
__m128i row0i = _mm_castps_si128(row0);
__m128i row1i = _mm_castps_si128(row1);
__m128i row2i = _mm_castps_si128(row2);
__m128i row3i = _mm_castps_si128(row3);
simd4scalari row0i = SIMD128::castps_si(row0);
simd4scalari row1i = SIMD128::castps_si(row1);
simd4scalari row2i = SIMD128::castps_si(row2);
simd4scalari row3i = SIMD128::castps_si(row3);
__m128i vTemp = row2i;
row2i = _mm_unpacklo_epi32(row2i, row3i);
vTemp = _mm_unpackhi_epi32(vTemp, row3i);
simd4scalari vTemp = row2i;
row2i = SIMD128::unpacklo_epi32(row2i, row3i);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
row3i = row0i;
row0i = _mm_unpacklo_epi32(row0i, row1i);
row3i = _mm_unpackhi_epi32(row3i, row1i);
row0i = SIMD128::unpacklo_epi32(row0i, row1i);
row3i = SIMD128::unpackhi_epi32(row3i, row1i);
row1i = row0i;
row0i = _mm_unpacklo_epi64(row0i, row2i);
row1i = _mm_unpackhi_epi64(row1i, row2i);
row0i = SIMD128::unpacklo_epi64(row0i, row2i);
row1i = SIMD128::unpackhi_epi64(row1i, row2i);
row2i = row3i;
row2i = _mm_unpacklo_epi64(row2i, vTemp);
row3i = _mm_unpackhi_epi64(row3i, vTemp);
row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
row0 = _mm_castsi128_ps(row0i);
row1 = _mm_castsi128_ps(row1i);
row2 = _mm_castsi128_ps(row2i);
row3 = _mm_castsi128_ps(row3i);
row0 = SIMD128::castsi_ps(row0i);
row1 = SIMD128::castsi_ps(row1i);
row2 = SIMD128::castsi_ps(row2i);
row3 = SIMD128::castsi_ps(row3i);
}
INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
{
__m128i vTemp = row2;
row2 = _mm_unpacklo_epi32(row2, row3);
vTemp = _mm_unpackhi_epi32(vTemp, row3);
simd4scalari vTemp = row2;
row2 = SIMD128::unpacklo_epi32(row2, row3);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
row3 = row0;
row0 = _mm_unpacklo_epi32(row0, row1);
row3 = _mm_unpackhi_epi32(row3, row1);
row0 = SIMD128::unpacklo_epi32(row0, row1);
row3 = SIMD128::unpackhi_epi32(row3, row1);
row1 = row0;
row0 = _mm_unpacklo_epi64(row0, row2);
row1 = _mm_unpackhi_epi64(row1, row2);
row0 = SIMD128::unpacklo_epi64(row0, row2);
row1 = SIMD128::unpackhi_epi64(row1, row2);
row2 = row3;
row2 = _mm_unpacklo_epi64(row2, vTemp);
row3 = _mm_unpackhi_epi64(row3, vTemp);
row2 = SIMD128::unpacklo_epi64(row2, vTemp);
row3 = SIMD128::unpackhi_epi64(row3, vTemp);
}
#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
@ -94,10 +94,10 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@ -106,7 +106,7 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
}
INLINE
void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
@ -118,10 +118,10 @@ void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@ -227,16 +227,16 @@ struct Transpose8_8_8_8
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
__m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
__m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
__m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
__m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
__m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
__m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
__m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
__m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
_mm_store_si128((__m128i*)pDst, c0123lo);
_mm_store_si128((__m128i*)(pDst + 16), c0123hi);
simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
SIMD128::store_si((simd4scalari*)pDst, c0123lo);
SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
#else
simdscalari dst01 = _simd_shuffle_epi8(src,
_simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
@ -254,10 +254,10 @@ struct Transpose8_8_8_8
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
__m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
__m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
__m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
@ -305,10 +305,10 @@ struct Transpose8_8
#if KNOB_SIMD_WIDTH == 8
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
__m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
__m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = _mm_unpacklo_epi8(rg, g);
_mm_store_si128((__m128i*)pDst, rg);
simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = SIMD128::unpacklo_epi8(rg, g);
SIMD128::store_si((simd4scalari*)pDst, rg);
#else
#error Unsupported vector width
#endif
@ -317,8 +317,8 @@ struct Transpose8_8
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
__m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
@ -349,16 +349,16 @@ struct Transpose32_32_32_32
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
__m128 vDst[8];
simd4scalar vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst+4, vDst[1]);
_mm_store_ps((float*)pDst+8, vDst[2]);
_mm_store_ps((float*)pDst+12, vDst[3]);
_mm_store_ps((float*)pDst+16, vDst[4]);
_mm_store_ps((float*)pDst+20, vDst[5]);
_mm_store_ps((float*)pDst+24, vDst[6]);
_mm_store_ps((float*)pDst+28, vDst[7]);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst+4, vDst[1]);
SIMD128::store_ps((float*)pDst+8, vDst[2]);
SIMD128::store_ps((float*)pDst+12, vDst[3]);
SIMD128::store_ps((float*)pDst+16, vDst[4]);
SIMD128::store_ps((float*)pDst+20, vDst[5]);
SIMD128::store_ps((float*)pDst+24, vDst[6]);
SIMD128::store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
@ -400,16 +400,16 @@ struct Transpose32_32_32
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
__m128 vDst[8];
simd4scalar vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst + 4, vDst[1]);
_mm_store_ps((float*)pDst + 8, vDst[2]);
_mm_store_ps((float*)pDst + 12, vDst[3]);
_mm_store_ps((float*)pDst + 16, vDst[4]);
_mm_store_ps((float*)pDst + 20, vDst[5]);
_mm_store_ps((float*)pDst + 24, vDst[6]);
_mm_store_ps((float*)pDst + 28, vDst[7]);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
@ -448,21 +448,21 @@ struct Transpose32_32
{
#if KNOB_SIMD_WIDTH == 8
const float* pfSrc = (const float*)pSrc;
__m128 src_r0 = _mm_load_ps(pfSrc + 0);
__m128 src_r1 = _mm_load_ps(pfSrc + 4);
__m128 src_g0 = _mm_load_ps(pfSrc + 8);
__m128 src_g1 = _mm_load_ps(pfSrc + 12);
simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
__m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
__m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
__m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
__m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
float* pfDst = (float*)pDst;
_mm_store_ps(pfDst + 0, dst0);
_mm_store_ps(pfDst + 4, dst1);
_mm_store_ps(pfDst + 8, dst2);
_mm_store_ps(pfDst + 12, dst3);
SIMD128::store_ps(pfDst + 0, dst0);
SIMD128::store_ps(pfDst + 4, dst1);
SIMD128::store_ps(pfDst + 8, dst2);
SIMD128::store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
@ -504,25 +504,25 @@ struct Transpose16_16_16_16
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
__m128i src_r = _mm256_extractf128_si256(src_rg, 0);
__m128i src_g = _mm256_extractf128_si256(src_rg, 1);
__m128i src_b = _mm256_extractf128_si256(src_ba, 0);
__m128i src_a = _mm256_extractf128_si256(src_ba, 1);
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
__m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
__m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
__m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
__m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
__m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
__m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
__m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
__m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
_mm_store_si128(((__m128i*)pDst) + 0, dst0);
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
@ -573,25 +573,25 @@ struct Transpose16_16_16
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
__m128i src_r = _mm256_extractf128_si256(src_rg, 0);
__m128i src_g = _mm256_extractf128_si256(src_rg, 1);
__m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
__m128i src_a = _mm_undefined_si128();
simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
simd4scalari src_a = SIMD128::setzero_si();
__m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
__m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
__m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
__m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
__m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
__m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
__m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
__m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
_mm_store_si128(((__m128i*)pDst) + 0, dst0);
_mm_store_si128(((__m128i*)pDst) + 1, dst1);
_mm_store_si128(((__m128i*)pDst) + 2, dst2);
_mm_store_si128(((__m128i*)pDst) + 3, dst3);
SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
@ -642,17 +642,17 @@ struct Transpose16_16
#if KNOB_SIMD_WIDTH == 8
simdscalar src = _simd_load_ps((const float*)pSrc);
__m128 comp0 = _mm256_castps256_ps128(src);
__m128 comp1 = _mm256_extractf128_ps(src, 1);
simd4scalar comp0 = _simd_extractf128_ps(src, 0);
simd4scalar comp1 = _simd_extractf128_ps(src, 1);
__m128i comp0i = _mm_castps_si128(comp0);
__m128i comp1i = _mm_castps_si128(comp1);
simd4scalari comp0i = SIMD128::castps_si(comp0);
simd4scalari comp1i = SIMD128::castps_si(comp1);
__m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
__m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
_mm_store_si128((__m128i*)pDst, resLo);
_mm_store_si128((__m128i*)pDst + 1, resHi);
SIMD128::store_si((simd4scalari*)pDst, resLo);
SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif

View file

@ -527,7 +527,7 @@ static void StreamOut(
// Write all entries into primitive data buffer for SOS.
while (_BitScanForward(&slot, soMask))
{
__m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
pa.AssembleSingle(paSlot, primIndex, attrib);
@ -941,7 +941,9 @@ static void GeometryShaderStage(
if (HasStreamOutT::value)
{
#if ENABLE_AVX512_SIMD16
gsPa.useAlternateOffset = false;
#endif
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
}
@ -1279,7 +1281,9 @@ static void TessellationStages(
{
if (HasStreamOutT::value)
{
#if ENABLE_AVX512_SIMD16
tessPa.useAlternateOffset = false;
#endif
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
}

View file

@ -391,7 +391,7 @@ struct PA_STATE_BASE; // forward decl
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
#if USE_SIMD16_FRONTEND
void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
#endif

View file

@ -92,7 +92,7 @@ struct PA_STATE
#if ENABLE_AVX512_SIMD16
virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
virtual bool NextPrim() = 0;
virtual SIMDVERTEX& GetNextVsOutput() = 0;
virtual bool GetNextStreamOutput() = 0;
@ -139,7 +139,7 @@ struct PA_STATE_OPT : public PA_STATE
#if ENABLE_AVX512_SIMD16
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
@ -205,7 +205,7 @@ struct PA_STATE_OPT : public PA_STATE
#endif
// Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
}
@ -767,7 +767,7 @@ PRAGMA_WARNING_POP()
}
#endif
void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
{
// move to slot
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
@ -1253,7 +1253,7 @@ struct PA_TESS : PA_STATE
_simd16_setzero_ps(),
pBase,
indices,
mask,
_simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
@ -1263,7 +1263,7 @@ struct PA_TESS : PA_STATE
pBase,
indices,
_simd_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
4); // gcc doesn't like sizeof(float)
#endif
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
}
@ -1302,7 +1302,7 @@ struct PA_TESS : PA_STATE
_simd16_setzero_ps(),
pBase,
indices,
mask,
_simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
#else
simdscalar temp = _simd_mask_i32gather_ps(
@ -1321,7 +1321,7 @@ struct PA_TESS : PA_STATE
}
#endif
void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
SWR_ASSERT(slot < m_numAttributes);
SWR_ASSERT(primIndex < PA_TESS::NumPrims());

View file

@ -34,103 +34,103 @@
#if (KNOB_SIMD_WIDTH == 8)
INLINE __m128 swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}
INLINE __m128 swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
INLINE __m128 swizzleLane0(const simdvector &v)
INLINE simd4scalar swizzleLane0(const simdvector &v)
{
return swizzleLane0(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane1(const simdvector &v)
INLINE simd4scalar swizzleLane1(const simdvector &v)
{
return swizzleLane1(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane2(const simdvector &v)
INLINE simd4scalar swizzleLane2(const simdvector &v)
{
return swizzleLane2(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane3(const simdvector &v)
INLINE simd4scalar swizzleLane3(const simdvector &v)
{
return swizzleLane3(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane4(const simdvector &v)
INLINE simd4scalar swizzleLane4(const simdvector &v)
{
return swizzleLane4(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane5(const simdvector &v)
INLINE simd4scalar swizzleLane5(const simdvector &v)
{
return swizzleLane5(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane6(const simdvector &v)
INLINE simd4scalar swizzleLane6(const simdvector &v)
{
return swizzleLane6(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLane7(const simdvector &v)
INLINE simd4scalar swizzleLane7(const simdvector &v)
{
return swizzleLane7(v.x, v.y, v.z, v.w);
}
INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
{
switch (lane)
{
@ -156,87 +156,87 @@ INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
}
#if ENABLE_AVX512_SIMD16
INLINE __m128 swizzleLane0(const simd16vector &v)
INLINE simd4scalar swizzleLane0(const simd16vector &v)
{
return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane1(const simd16vector &v)
INLINE simd4scalar swizzleLane1(const simd16vector &v)
{
return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane2(const simd16vector &v)
INLINE simd4scalar swizzleLane2(const simd16vector &v)
{
return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane3(const simd16vector &v)
INLINE simd4scalar swizzleLane3(const simd16vector &v)
{
return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane4(const simd16vector &v)
INLINE simd4scalar swizzleLane4(const simd16vector &v)
{
return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane5(const simd16vector &v)
INLINE simd4scalar swizzleLane5(const simd16vector &v)
{
return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane6(const simd16vector &v)
INLINE simd4scalar swizzleLane6(const simd16vector &v)
{
return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane7(const simd16vector &v)
INLINE simd4scalar swizzleLane7(const simd16vector &v)
{
return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
INLINE __m128 swizzleLane8(const simd16vector &v)
INLINE simd4scalar swizzleLane8(const simd16vector &v)
{
return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLane9(const simd16vector &v)
INLINE simd4scalar swizzleLane9(const simd16vector &v)
{
return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneA(const simd16vector &v)
INLINE simd4scalar swizzleLaneA(const simd16vector &v)
{
return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneB(const simd16vector &v)
INLINE simd4scalar swizzleLaneB(const simd16vector &v)
{
return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneC(const simd16vector &v)
INLINE simd4scalar swizzleLaneC(const simd16vector &v)
{
return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneD(const simd16vector &v)
INLINE simd4scalar swizzleLaneD(const simd16vector &v)
{
return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneE(const simd16vector &v)
INLINE simd4scalar swizzleLaneE(const simd16vector &v)
{
return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneF(const simd16vector &v)
INLINE simd4scalar swizzleLaneF(const simd16vector &v)
{
return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
INLINE __m128 swizzleLaneN(const simd16vector &v, int lane)
INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
{
switch (lane)
{
@ -286,7 +286,7 @@ bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -294,7 +294,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -302,7 +302,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -310,7 +310,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -318,7 +318,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -326,7 +326,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -334,13 +334,13 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@ -350,10 +350,10 @@ bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
template <uint32_t TotalControlPoints>
void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
// We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
// KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
@ -788,7 +788,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -1057,7 +1057,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@ -1325,7 +1325,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
@ -1491,7 +1491,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -1741,7 +1741,7 @@ bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
PaLineStripSingle0(pa, slot, primIndex, verts);
@ -1855,7 +1855,7 @@ bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -2075,7 +2075,7 @@ bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@ -2239,7 +2239,7 @@ bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}
#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@ -2529,7 +2529,7 @@ void PaRectListSingle0(
PA_STATE_OPT& pa,
uint32_t slot,
uint32_t primIndex,
__m128 verts[])
simd4scalar verts[])
{
// We have 12 simdscalars contained within 3 simdvectors which
// hold at least 8 triangles worth of data. We want to assemble a single

View file

@ -199,15 +199,15 @@ struct StorePixels<32, 2>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
{
// Each 4-pixel row is 16-bytes
__m128i *pZRow01 = (__m128i*)pSrc;
__m128i vQuad00 = _mm_load_si128(pZRow01);
__m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
simd4scalari *pZRow01 = (simd4scalari*)pSrc;
simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
__m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
__m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
_mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
_mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
}
};
@ -218,20 +218,20 @@ struct StorePixels<32, 4>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
// 4 x 16 bytes = 64 bytes, 16 pixels
const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
__m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
// Unswizzle from SWR-Z order
__m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
__m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
__m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
__m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3
simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7
simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B
simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F
_mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
_mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
_mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
_mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5
SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7
SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D
SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F
}
};
@ -251,10 +251,10 @@ struct StorePixels<64, 4>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
// Each 4-pixel row is 32 bytes.
const __m128i* pPixSrc = (const __m128i*)pSrc;
const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
// order of pointers match SWR-Z layout
__m128i** pvDsts = (__m128i**)&ppDsts[0];
simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
*pvDsts[0] = pPixSrc[0];
*pvDsts[1] = pPixSrc[1];
*pvDsts[2] = pPixSrc[2];
@ -269,9 +269,9 @@ struct StorePixels<64, 8>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
{
// 8 x 16 bytes = 128 bytes, 16 pixels
const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
__m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
// order of pointers match SWR-Z layout
*ppDsts128[0] = pSrc128[0]; // 0 1
@ -301,10 +301,10 @@ struct StorePixels<128, 8>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
{
// Each 4-pixel row is 64 bytes.
const __m128i* pPixSrc = (const __m128i*)pSrc;
const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
// Unswizzle from SWR-Z order
__m128i** pvDsts = (__m128i**)&ppDsts[0];
simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
*pvDsts[0] = pPixSrc[0];
*pvDsts[1] = pPixSrc[2];
*pvDsts[2] = pPixSrc[1];
@ -323,9 +323,9 @@ struct StorePixels<128, 16>
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
{
// 16 x 16 bytes = 256 bytes, 16 pixels
const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
__m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
for (uint32_t i = 0; i < 16; i += 4)
{
@ -563,8 +563,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
// merge/store data into destination but don't overwrite the X8 bits
simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
simd16scalari dest = _simd16_setzero_si();
@ -575,8 +575,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
_simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
_simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
#else
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
@ -593,25 +593,25 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
// Store data into destination but don't overwrite the X8 bits
// Each 4-pixel row is 16-bytes
__m128i *pZRow01 = (__m128i*)aosTile;
__m128i vQuad00 = _mm_load_si128(pZRow01);
__m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
simd4scalari *pZRow01 = (simd4scalari*)aosTile;
simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
__m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
__m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
__m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
__m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
__m128i vMask = _mm_set1_epi32(0xFFFFFF);
simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
vDst0 = _mm_andnot_si128(vMask, vDst0);
vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
vDst1 = _mm_andnot_si128(vMask, vDst1);
vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
vDst0 = SIMD128::andnot_si(vMask, vDst0);
vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
vDst1 = SIMD128::andnot_si(vMask, vDst1);
vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
_mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
_mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
#endif
}
};
@ -683,8 +683,8 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
// store 8x2 memory order:
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
@ -736,15 +736,15 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
// splitting into two sets of 4 wide integer vector types
// because AVX doesn't have instructions to support this operation at 8 wide
__m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
__m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
__m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
__m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
__m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
__m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
__m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
__m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
@ -753,18 +753,18 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
// unpack into rows that get the tiling order correct
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
@ -785,7 +785,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
final = _mm256_permute4x64_epi64(final, 0xD8);
#endif
_simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
#if USE_8x2_TILE_BACKEND
@ -848,8 +848,8 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8
// store 8x2 memory order:
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
_simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
@ -894,29 +894,29 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
// splitting into two sets of 4 wide integer vector types
// because AVX doesn't have instructions to support this operation at 8 wide
__m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
__m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
__m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
__m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
__m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
__m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
// unpack into rows that get the tiling order correct
__m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
__m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
@ -936,7 +936,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
#endif
_simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
_simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
template<>