Mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2026-01-23 12:40:29 +01:00)
swr/rast: Switch intrinsic usage to SIMDLib
Switch from a macro-based SIMD intrinsics layer to a more C++-style implementation, which also adds AVX512 optimizations to the 128-bit and 256-bit SIMD widths.

Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
This commit is contained in:
parent 8b66d18a3b
commit fc4f6c44c4
30 changed files with 6206 additions and 2663 deletions
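The change replaces the flat _simd_* macro layer with static member functions on width-typed classes (SIMD128, SIMD256, SIMD512; defined in simdlib.hpp below), so one source tree can instantiate AVX, AVX2, and AVX512 code paths. A minimal before/after sketch, assuming the old macro spelling _simd_mul_ps from the prior simdintrin.h (only partially shown in this view):

    // Before: macro resolved by the preprocessor to one fixed ISA (assumed old spelling)
    simdscalar r0 = _simd_mul_ps(a, b);

    // After: member function, ISA selected at compile time via SIMD_ARCH
    simdscalar r1 = SIMD256::mul_ps(a, b);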
@@ -65,6 +65,19 @@ COMMON_CXX_SOURCES := \
 	rasterizer/common/rdtsc_buckets_shared.h \
 	rasterizer/common/simd16intrin.h \
 	rasterizer/common/simdintrin.h \
+	rasterizer/common/simdlib.hpp \
+	rasterizer/common/simdlib_128_avx.inl \
+	rasterizer/common/simdlib_128_avx2.inl \
+	rasterizer/common/simdlib_128_avx512.inl \
+	rasterizer/common/simdlib_256_avx.inl \
+	rasterizer/common/simdlib_256_avx2.inl \
+	rasterizer/common/simdlib_256_avx512.inl \
+	rasterizer/common/simdlib_512_avx512.inl \
+	rasterizer/common/simdlib_512_avx512_masks.inl \
+	rasterizer/common/simdlib_512_emu.inl \
+	rasterizer/common/simdlib_512_emu_masks.inl \
+	rasterizer/common/simdlib_interface.hpp \
+	rasterizer/common/simdlib_types.hpp \
 	rasterizer/common/swr_assert.cpp \
 	rasterizer/common/swr_assert.h
@@ -26,89 +26,37 @@
 #include "os.h"

+#include <cassert>
+#define SIMD_ARCH KNOB_ARCH
+#include "simdlib_types.hpp"

-#include <emmintrin.h>
-#include <immintrin.h>
-#include <xmmintrin.h>
+typedef SIMDImpl::SIMD128Impl::Float   simd4scalar;
+typedef SIMDImpl::SIMD128Impl::Double  simd4scalard;
+typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
+typedef SIMDImpl::SIMD128Impl::Vec4    simd4vector;
+typedef SIMDImpl::SIMD128Impl::Mask    simd4mask;
+
+typedef SIMDImpl::SIMD256Impl::Float   simd8scalar;
+typedef SIMDImpl::SIMD256Impl::Double  simd8scalard;
+typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
+typedef SIMDImpl::SIMD256Impl::Vec4    simd8vector;
+typedef SIMDImpl::SIMD256Impl::Mask    simd8mask;
+
+typedef SIMDImpl::SIMD512Impl::Float   simd16scalar;
+typedef SIMDImpl::SIMD512Impl::Double  simd16scalard;
+typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
+typedef SIMDImpl::SIMD512Impl::Vec4    simd16vector;
+typedef SIMDImpl::SIMD512Impl::Mask    simd16mask;

 #if KNOB_SIMD_WIDTH == 8
-typedef __m256 simdscalar;
-typedef __m256i simdscalari;
-typedef uint8_t simdmask;
+typedef simd8scalar  simdscalar;
+typedef simd8scalard simdscalard;
+typedef simd8scalari simdscalari;
+typedef simd8vector  simdvector;
+typedef simd8mask    simdmask;
 #else
 #error Unsupported vector width
 #endif

-// simd vector
-OSALIGNSIMD(union) simdvector
-{
-    simdscalar  v[4];
-    struct
-    {
-        simdscalar x, y, z, w;
-    };
-
-    simdscalar& operator[] (const int i) { return v[i]; }
-    const simdscalar& operator[] (const int i) const { return v[i]; }
-};
-
-#if ENABLE_AVX512_SIMD16
-
-#if KNOB_SIMD16_WIDTH == 16
-
-#if ENABLE_AVX512_EMULATION
-struct simd16scalar
-{
-    __m256 lo;
-    __m256 hi;
-};
-struct simd16scalard
-{
-    __m256d lo;
-    __m256d hi;
-};
-struct simd16scalari
-{
-    __m256i lo;
-    __m256i hi;
-};
-typedef uint16_t simd16mask;
-
-#else
-typedef __m512 simd16scalar;
-typedef __m512d simd16scalard;
-typedef __m512i simd16scalari;
-typedef __mmask16 simd16mask;
-#endif//ENABLE_AVX512_EMULATION
-#else
-#error Unsupported vector width
-#endif//KNOB_SIMD16_WIDTH == 16
-
-#define _simd16_masklo(mask) ((mask) & 0xFF)
-#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
-#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
-
-#if defined(_WIN32)
-#define SIMDAPI __vectorcall
-#else
-#define SIMDAPI
-#endif
-
-OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
-{
-    simd16scalar v[4];
-    struct
-    {
-        simd16scalar x, y, z, w;
-    };
-
-    simd16scalar& operator[] (const int i) { return v[i]; }
-    const simd16scalar& operator[] (const int i) const { return v[i]; }
-};
-
-#endif // ENABLE_AVX512_SIMD16
-
 INLINE
 UINT pdep_u32(UINT a, UINT mask)
 {
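The familiar typedef names survive, but they now alias wrapper types from simdlib_types.hpp instead of raw __m128/__m256 registers. A minimal compatibility sketch, assuming (as the suppressed simdlib_types.hpp diff presumably provides) implicit conversions between the wrappers and the native register types:

    simdscalar a = SIMD256::set1_ps(1.0f); // simdscalar == SIMDImpl::SIMD256Impl::Float
    __m256 raw   = a;                      // assumed implicit conversion to the raw register
    simdscalar b = raw;                    // and back, so existing intrinsic call sites still compile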
File diff suppressed because it is too large
File diff suppressed because it is too large
550 src/gallium/drivers/swr/rasterizer/common/simdlib.hpp (new file)
@@ -0,0 +1,550 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once

#include "simdlib_types.hpp"

// For documentation, please see the following include...
// #include "simdlib_interface.hpp"

namespace SIMDImpl
{
    namespace SIMD128Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        struct AVXImpl
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_128_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX

#if SIMD_ARCH >= SIMD_ARCH_AVX2
        struct AVX2Impl : AVXImpl
        {
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_128_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
        }; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        struct AVX512Impl : AVX2Impl
        {
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#undef __SIMD_LIB_AVX512_HPP__
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        struct Traits : SIMDImpl::Traits
        {
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            using Float   = SIMD128Impl::Float;
            using Double  = SIMD128Impl::Double;
            using Integer = SIMD128Impl::Integer;
            using Vec4    = SIMD128Impl::Vec4;
            using Mask    = SIMD128Impl::Mask;
        };
    } // ns SIMD128Impl

    namespace SIMD256Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        struct AVXImpl
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_256_avx.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImpl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX

#if SIMD_ARCH >= SIMD_ARCH_AVX2
        struct AVX2Impl : AVXImpl
        {
#define __SIMD_LIB_AVX2_HPP__
#include "simdlib_256_avx2.inl"
#undef __SIMD_LIB_AVX2_HPP__
        }; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        struct AVX512Impl : AVX2Impl
        {
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#undef __SIMD_LIB_AVX512_HPP__
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        struct Traits : SIMDImpl::Traits
        {
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            using Float   = SIMD256Impl::Float;
            using Double  = SIMD256Impl::Double;
            using Integer = SIMD256Impl::Integer;
            using Vec4    = SIMD256Impl::Vec4;
            using Mask    = SIMD256Impl::Mask;
        };
    } // ns SIMD256Impl

    namespace SIMD512Impl
    {
#if SIMD_ARCH >= SIMD_ARCH_AVX
        template<typename SIMD256T>
        struct AVXImplBase
        {
#define __SIMD_LIB_AVX_HPP__
#include "simdlib_512_emu.inl"
#include "simdlib_512_emu_masks.inl"
#undef __SIMD_LIB_AVX_HPP__
        }; // struct AVXImplBase
        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX

#if SIMD_ARCH >= SIMD_ARCH_AVX2
        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2

#if SIMD_ARCH >= SIMD_ARCH_AVX512
        struct AVX512Impl
        {
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#undef __SIMD_LIB_AVX512_HPP__
        }; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512

        struct Traits : SIMDImpl::Traits
        {
#if SIMD_ARCH == SIMD_ARCH_AVX
            using IsaImpl = AVXImpl;
#elif SIMD_ARCH == SIMD_ARCH_AVX2
            using IsaImpl = AVX2Impl;
#elif SIMD_ARCH == SIMD_ARCH_AVX512
            using IsaImpl = AVX512Impl;
#else
#error Invalid value for SIMD_ARCH
#endif

            using Float   = SIMD512Impl::Float;
            using Double  = SIMD512Impl::Double;
            using Integer = SIMD512Impl::Integer;
            using Vec4    = SIMD512Impl::Vec4;
            using Mask    = SIMD512Impl::Mask;
        };
    } // ns SIMD512Impl
} // ns SIMDImpl
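// Sketch: each width namespace above picks exactly one IsaImpl from SIMD_ARCH,
// and the ISA structs chain by inheritance, so for an AVX2 build the 256-bit
// implementation is effectively:
//
//   SIMDBase<SIMD256Impl::Traits> : AVX2Impl : AVXImpl
//
// AVX2 overrides shadow the AVX fallbacks; everything else is inherited.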
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
    using CompareType = typename Traits::CompareType;
    using ScaleFactor = typename Traits::ScaleFactor;
    using RoundMode   = typename Traits::RoundMode;
    using SIMD        = typename Traits::IsaImpl;
    using Float       = typename Traits::Float;
    using Double      = typename Traits::Double;
    using Integer     = typename Traits::Integer;
    using Vec4        = typename Traits::Vec4;
    using Mask        = typename Traits::Mask;

    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    static SIMDINLINE
    void vec4_load1_ps(Vec4& r, const float *p)
    {
        r[0] = SIMD::set1_ps(p[0]);
        r[1] = SIMD::set1_ps(p[1]);
        r[2] = SIMD::set1_ps(p[2]);
        r[3] = SIMD::set1_ps(p[3]);
    }

    static SIMDINLINE
    void vec4_set1_vps(Vec4& r, Float s)
    {
        r[0] = s;
        r[1] = s;
        r[2] = s;
        r[3] = s;
    }

    static SIMDINLINE
    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r   = SIMD::mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)

        tmp = SIMD::mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
        r   = SIMD::add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)

        tmp = SIMD::mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
        r   = SIMD::add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

        return r;
    }

    static SIMDINLINE
    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
    {
        Float tmp, r;
        r   = SIMD::mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)

        tmp = SIMD::mul_ps(v0[1], v1[1]);   // (v0.y*v1.y)
        r   = SIMD::add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)

        tmp = SIMD::mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
        r   = SIMD::add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

        tmp = SIMD::mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
        r   = SIMD::add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)

        return r;
    }

    static SIMDINLINE
    Float vec4_rcp_length_ps(const Vec4& v)
    {
        Float length = vec4_dp4_ps(v, v);
        return SIMD::rsqrt_ps(length);
    }

    static SIMDINLINE
    void vec4_normalize_ps(Vec4& r, const Vec4& v)
    {
        Float rcpLength = vec4_rcp_length_ps(v);

        r[0] = SIMD::mul_ps(v[0], rcpLength);
        r[1] = SIMD::mul_ps(v[1], rcpLength);
        r[2] = SIMD::mul_ps(v[2], rcpLength);
        r[3] = SIMD::mul_ps(v[3], rcpLength);
    }

    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
    {
        r[0] = SIMD::mul_ps(v[0], s);
        r[1] = SIMD::mul_ps(v[1], s);
        r[2] = SIMD::mul_ps(v[2], s);
        r[3] = SIMD::mul_ps(v[3], s);
    }

    static SIMDINLINE
    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        r[0] = SIMD::mul_ps(v0[0], v1[0]);
        r[1] = SIMD::mul_ps(v0[1], v1[1]);
        r[2] = SIMD::mul_ps(v0[2], v1[2]);
        r[3] = SIMD::mul_ps(v0[3], v1[3]);
    }

    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
    {
        r[0] = SIMD::add_ps(v0[0], s);
        r[1] = SIMD::add_ps(v0[1], s);
        r[2] = SIMD::add_ps(v0[2], s);
        r[3] = SIMD::add_ps(v0[3], s);
    }

    static SIMDINLINE
    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        r[0] = SIMD::add_ps(v0[0], v1[0]);
        r[1] = SIMD::add_ps(v0[1], v1[1]);
        r[2] = SIMD::add_ps(v0[2], v1[2]);
        r[3] = SIMD::add_ps(v0[3], v1[3]);
    }

    static SIMDINLINE
    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
    {
        r[0] = SIMD::min_ps(v0[0], s);
        r[1] = SIMD::min_ps(v0[1], s);
        r[2] = SIMD::min_ps(v0[2], s);
        r[3] = SIMD::min_ps(v0[3], s);
    }

    static SIMDINLINE
    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
    {
        r[0] = SIMD::max_ps(v0[0], s);
        r[1] = SIMD::max_ps(v0[1], s);
        r[2] = SIMD::max_ps(v0[2], s);
        r[3] = SIMD::max_ps(v0[3], s);
    }
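    // Sketch: Vec4 here is structure-of-arrays, not a single xyzw point. Each
    // of the four registers holds one component across all lanes, so with the
    // 256-bit instantiation a Vec4 carries 8 points at once:
    //
    //   Vec4 pts;                           // pts[0]=x0..x7, pts[1]=y0..y7, ...
    //   vec4_load1_ps(pts, xyzw);           // broadcast a single (x,y,z,w)
    //   Float d = vec4_dp4_ps(pts, pts);    // 8 squared lengths in one call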
    // Matrix4x4 * Vector4
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    static SIMDINLINE
    void SIMDCALL mat4x4_vec4_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m  = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[0] = r0;

        m  = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[1] = r0;

        m  = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[2] = r0;

        m  = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
        r1 = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
        result[3] = r0;
    }

    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
    //   outVec.w = 0
    static SIMDINLINE
    void SIMDCALL mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m  = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        result[0] = r0;

        m  = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        result[1] = r0;

        m  = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        result[2] = r0;

        result[3] = SIMD::setzero_ps();
    }

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
    static SIMDINLINE
    void SIMDCALL mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m  = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[0] = r0;

        m  = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[1] = r0;

        m  = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[2] = r0;

        m  = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
        result[3] = SIMD::add_ps(r0, m);         // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    }

    static SIMDINLINE
    void SIMDCALL mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v)
    {
        Float m;
        Float r0;
        Float r1;

        m  = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[0] = r0;

        m  = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[1] = r0;

        m  = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
        r0 = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
        r1 = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
        r1 = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
        r0 = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
        m  = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
        r0 = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
        result[2] = r0;
        result[3] = SIMD::set1_ps(1.0f);
    }
}; // struct SIMDBase

using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
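With the aliases in place, rasterizer code selects a width explicitly and the ISA is chosen by SIMD_ARCH at compile time. A minimal usage sketch (pData and its alignment are assumptions for illustration):

    SIMD256::Float a = SIMD256::set1_ps(1.0f);
    SIMD256::Float b = SIMD256::load_ps(pData);      // pData: aligned float[8] (assumed)
    SIMD256::Float c = SIMD256::fmadd_ps(a, b, b);   // native FMA on AVX2+, mul+add fallback on AVX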
545 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl (new file)
@@ -0,0 +1,545 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD128 AVX (1) implementation
//============================================================================
#define SIMD_WRAPPER_1(op)  \
    static SIMDINLINE Float SIMDCALL op(Float a)   \
    {\
        return _mm_##op(a);\
    }

#define SIMD_WRAPPER_2(op)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm_##op(a, b);\
    }

#define SIMD_DWRAPPER_2(op)  \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return _mm_##op(a, b);\
    }

#define SIMD_WRAPPER_2I(op)  \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm_##op(a, b, ImmT);\
    }

#define SIMD_DWRAPPER_2I(op)  \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return _mm_##op(a, b, ImmT);\
    }

#define SIMD_WRAPPER_3(op)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
    {\
        return _mm_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return _mm_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin)  \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)

#define SIMD_IWRAPPER_2_(op, intrin)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return intrin(a, b);\
    }

#define SIMD_IWRAPPER_2(op)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return _mm_##op(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
    }

#define SIMD_IWRAPPER_2I(op)  \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return _mm_##op(a, b, ImmT);\
    }
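// Sketch: these wrappers give each ISA struct a uniform member-function
// surface over the raw intrinsics. For example, SIMD_WRAPPER_2(add_ps)
// below expands to:
//
//   static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//   {
//       return _mm_add_ps(a, b);
//   }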
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);     // return a + b
SIMD_WRAPPER_2(div_ps);     // return a / b
SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);     // return a * b
SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);     // return a - b

static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
    return add_ps(mul_ps(a, b), c);
}
static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
{
    return sub_ps(mul_ps(a, b), c);
}

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm_round_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps);                         // return a & b       (float treated as int)
SIMD_IWRAPPER_2_(and_si, _mm_and_si128);        // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);                      // return (~a) & b    (float treated as int)
SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128);  // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);                          // return a | b       (float treated as int)
SIMD_IWRAPPER_2_(or_si, _mm_or_si128);          // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);                         // return a ^ b       (float treated as int)
SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);        // return a ^ b       (int)

//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32);   // return a << ImmT

static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
{
    int32_t a, count;
    a = _mm_extract_epi32(vA, 0);
    count = _mm_extract_epi32(vB, 0);
    a <<= count;
    vA = _mm_insert_epi32(vA, a, 0);

    a = _mm_extract_epi32(vA, 1);
    count = _mm_extract_epi32(vB, 1);
    a <<= count;
    vA = _mm_insert_epi32(vA, a, 1);

    a = _mm_extract_epi32(vA, 2);
    count = _mm_extract_epi32(vB, 2);
    a <<= count;
    vA = _mm_insert_epi32(vA, a, 2);

    a = _mm_extract_epi32(vA, 3);
    count = _mm_extract_epi32(vB, 3);
    a <<= count;
    vA = _mm_insert_epi32(vA, a, 3);

    return vA;
}
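// Sketch: AVX (1) has no per-lane variable shifts, so sllv/srlv here emulate
//
//   for (i = 0; i < 4; ++i) out[i] = a[i] << b[i];   // resp. >>
//
// via extract/insert on each lane. The AVX2 implementation replaces them with
// the native _mm_sllv_epi32 / _mm_srlv_epi32 (see simdlib_128_avx2.inl below).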
SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT     (int32)
SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT     (uint32)
SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}

static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
{
    int32_t a, count;
    a = _mm_extract_epi32(vA, 0);
    count = _mm_extract_epi32(vB, 0);
    a >>= count;
    vA = _mm_insert_epi32(vA, a, 0);

    a = _mm_extract_epi32(vA, 1);
    count = _mm_extract_epi32(vB, 1);
    a >>= count;
    vA = _mm_insert_epi32(vA, a, 1);

    a = _mm_extract_epi32(vA, 2);
    count = _mm_extract_epi32(vB, 2);
    a >>= count;
    vA = _mm_insert_epi32(vA, a, 2);

    a = _mm_extract_epi32(vA, 3);
    count = _mm_extract_epi32(vB, 3);
    a >>= count;
    vA = _mm_insert_epi32(vA, a, 3);

    return vA;
}

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)    // return *(Float*)(&a)
{
    return _mm_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
{
    return _mm_castps_si128(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)  // return *(Double*)(&a)
{
    return _mm_castsi128_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a)    // return *(Double*)(&a)
{
    return _mm_castps_pd(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
{
    return _mm_castsi128_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
{
    return _mm_cvtepi32_ps(a);
}

SIMD_IWRAPPER_1(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
SIMD_IWRAPPER_1(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
SIMD_IWRAPPER_1(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
SIMD_IWRAPPER_1(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)  // return (int32)a    (float --> int32)
{
    return _mm_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
{
    return _mm_cvttps_epi32(a);
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }

SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return 0 != _mm_testz_ps(a, b);
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return 0 != _mm_testz_si128(a, b);
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    return _mm_broadcast_ss(p);
}

SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)   // return a[swiz[i]] for each 32-bit lane i (int32)
{
    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)  // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm_permutevar_ps(a, swiz);
}

SIMD_IWRAPPER_1I(shuffle_epi32);

template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;

SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);

//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    uint32_t *pOffsets = (uint32_t*)&idx;
    Float vResult;
    float* pResult = (float*)&vResult;
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
        offset = offset * static_cast<uint32_t>(ScaleT);
        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}
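// Sketch: AVX (1) also lacks a gather instruction, so i32gather_ps above walks
// the index vector in scalar code; the AVX2 override maps straight to
// _mm_i32gather_ps. Usage, with a hypothetical ScaleFactor enumerator (the
// real names live in simdlib_types.hpp, whose diff is suppressed in this view):
//
//   Float v = i32gather_ps<ScaleFactor::SF_4 /* assumed name */>(pFloats, idx);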
static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
{
    return _mm_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)    // return *p
{
    return _mm_load_si128(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
{
    return _mm_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p)   // return *p    (same as load_si but allows for unaligned mem)
{
    return _mm_lddqu_si128(&p->v);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    uint32_t *pOffsets = (uint32_t*)&idx;
    Float vResult = old;
    float* pResult = (float*)&vResult;
    DWORD index;
    uint32_t umask = movemask_ps(mask);
    while (_BitScanForward(&index, umask))
    {
        umask &= ~(1 << index);
        uint32_t offset = pOffsets[index];
        offset = offset * static_cast<uint32_t>(ScaleT);
        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
    }

    return vResult;
}

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    _mm_maskstore_ps(p, mask, src);
}

static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    return static_cast<uint32_t>(_mm_movemask_epi8(a));
}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    return static_cast<uint32_t>(_mm_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    return static_cast<uint32_t>(_mm_movemask_ps(a));
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
{
    return _mm_set1_ps(f);
}

static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
{
    return _mm_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si()    // return 0 (integer)
{
    return _mm_setzero_si128();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
{
    _mm_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)       // *p = a
{
    _mm_store_si128(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
{
    _mm_storeu_si128(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
{
    _mm_stream_ps(p, a);
}

static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
{
    return _mm_set_ps(in3, in2, in1, in0);
}

template <int ImmT>
static SIMDINLINE float SIMDCALL extract_ps(Float a)
{
    int tmp = _mm_extract_ps(a, ImmT);
    return *reinterpret_cast<float*>(&tmp);
}

#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
68 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl (new file)
@@ -0,0 +1,68 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD128 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are those that replace the AVX (1) versions.
// Only 2 shifts and 2 gathers were introduced with AVX 2.
// It also adds native support for FMA operations.
//============================================================================
#define SIMD_WRAPPER_3(op)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
    {\
        return _mm_##op(a, b, c);\
    }

SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c

static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
{
    return _mm_sllv_epi32(vA, vB);
}

static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
{
    return _mm_srlv_epi32(vA, vB);
}

template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
}

#undef SIMD_WRAPPER_3
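Each override above collapses an AVX (1) emulation into a single instruction; roughly:

    // fmadd_ps(a, b, c)       -> _mm_fmadd_ps(a, b, c)        (was add_ps(mul_ps(a, b), c))
    // sllv_epi32(a, b)        -> _mm_sllv_epi32(a, b)         (was 4x extract/shift/insert)
    // i32gather_ps<S>(p, idx) -> _mm_i32gather_ps(p, idx, S)  (was a scalar gather loop)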
408 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl (new file)
@@ -0,0 +1,408 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD128 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are those that replace the AVX (2) versions.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

private:
    static SIMDINLINE __m512  __conv(Float r)   { return _mm512_castps128_ps512(r.v); }
    static SIMDINLINE __m512d __conv(Double r)  { return _mm512_castpd128_pd512(r.v); }
    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
    static SIMDINLINE Float   __conv(__m512 r)  { return _mm512_castps512_ps128(r); }
    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
public:
#define SIMD_WRAPPER_1_(op, intrin, mask)  \
    static SIMDINLINE Float SIMDCALL op(Float a)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
    }
#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))

#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
    template<int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float a)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
    }
#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))

#define SIMD_WRAPPER_2_(op, intrin, mask)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
    }
#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))

#define SIMD_WRAPPER_2I(op)  \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
    }

#define SIMD_WRAPPER_3_(op, intrin, mask)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
    }
#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))

#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
    static SIMDINLINE Double SIMDCALL op(Double a)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
    }
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
#endif

#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
    template<int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
    }
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
#endif

#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
    }
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
#endif

#define SIMD_DWRAPPER_2I(op)  \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
    }

#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
    }
#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
#endif

#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
    }
#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
#endif

#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
    }
#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
#endif

#define SIMD_IWRAPPER_2I(op)  \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
    }
||||
//-----------------------------------------------------------------------
|
||||
// Single precision floating point arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_WRAPPER_2(add_ps); // return a + b
|
||||
SIMD_WRAPPER_2(div_ps); // return a / b
|
||||
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
|
||||
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
|
||||
SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
|
||||
SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
|
||||
SIMD_WRAPPER_2(mul_ps); // return a * b
|
||||
SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xf)); // return 1.0f / a
|
||||
SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
|
||||
SIMD_WRAPPER_2(sub_ps); // return a - b
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Integer (various width) arithmetic operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
|
||||
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
|
||||
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
|
||||
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
#endif
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
|
||||
// and store the low 32 bits of the intermediate integers in dst.
|
||||
SIMD_IWRAPPER_2_32(mullo_epi32);
|
||||
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
|
||||
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
|
||||
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
|
||||
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Shift operations
|
||||
//-----------------------------------------------------------------------
|
||||
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
|
||||
SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
|
||||
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
|
||||
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
|
||||
SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
|
||||
|
||||
// use AVX2 version
|
||||
//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Conversion operations (Use AVX2 versions)
|
||||
//-----------------------------------------------------------------------
|
||||
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
|
||||
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
|
||||
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
|
||||
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
|
||||
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
|
||||
|
||||
//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);  // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8);  // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
//    return cmpgt_epi32(b, a);
//}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
#if !defined(AVX512F_STRICT)

SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

#endif

// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);

//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}

SIMD_IWRAPPER_1I_32(shuffle_epi32);
//template<int ImmT>
//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);

#if !defined(AVX512F_STRICT)

SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);

#endif

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return __conv(_mm512_maskz_load_ps(__mmask16(0xf), p));
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return __conv(_mm512_maskz_load_epi32(__mmask16(0xf), p));
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
}

template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return __conv(_mm512_mask_i32gather_ps(
                    _mm512_setzero_ps(),
                    __mmask16(0xf),
                    __conv(idx),
                    p,
                    static_cast<int>(ScaleT)));
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 m = 0xf;
    // Test the sign bit (1 << 31) of each lane, per the contract above.
    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
                                    _mm512_set1_epi32(0x80000000));
    return __conv(_mm512_mask_i32gather_ps(
                    __conv(old),
                    m,
                    __conv(idx),
                    p,
                    static_cast<int>(ScaleT)));
}
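// Editorial sketch (not part of the original sources): the test above turns
// the float vector mask into a k-register by checking each lane's sign bit,
// the same convention AVX blendv/maskstore use. A hypothetical caller
// gathering 4 floats under a computed lane mask might look like:
//
//     Float old   = setzero_ps();
//     Float lanes = cmplt_ps(a, b);   // ~0 in lanes where a < b
//     Float r = mask_i32gather_ps<ScaleFactor::SF_4>(old, base, idx, lanes);
//
// ScaleFactor::SF_4 is assumed here to be the 4-byte element scale declared
// in simdlib_types.hpp.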

#if !defined(AVX512F_STRICT)

static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    __mmask64 m = 0xffffull;
    return static_cast<uint32_t>(
        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}

#endif

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    __mmask16 m = 0xf;
    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
    _mm512_mask_store_ps(p, m, __conv(src));
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm512_mask_store_ps(p, __mmask16(0xf), __conv(a));
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm512_mask_store_epi32(p, __mmask16(0xf), __conv(a));
}

//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64
757 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl Normal file
@ -0,0 +1,757 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

using SIMD128T = SIMD128Impl::AVXImpl;

//============================================================================
// SIMD256 AVX (1) implementation
//============================================================================

#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return _mm256_##op(a);\
    }

#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm256_##op(a, b);\
    }

#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm256_##op(a, b);\
    }

#define SIMD_WRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm256_##op(a, b, ImmT);\
    }

#define SIMD_DWRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm256_##op(a, b, ImmT);\
    }

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return _mm256_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a);\
    }

#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
    }

#define SIMD_IFWRAPPER_2I(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
    }

#define SIMD_IWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

#define SIMD_IWRAPPER_3(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
    {\
        return _mm256_##op(a, b, c);\
    }

// emulated integer simd
#define SIMD_EMU_IWRAPPER_1(op) \
    static SIMDINLINE \
    Integer SIMDCALL op(Integer a)\
    {\
        return Integer\
        {\
            SIMD128T::op(a.v4[0]),\
            SIMD128T::op(a.v4[1]),\
        };\
    }
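// Editorial illustration: AVX (1) has no 256-bit integer ALU ops, so the
// emulation macro splits each operation into two 128-bit halves. For
// example, SIMD_EMU_IWRAPPER_1(abs_epi32) generates:
//
//     static SIMDINLINE Integer SIMDCALL abs_epi32(Integer a)
//     {
//         return Integer
//         {
//             SIMD128T::abs_epi32(a.v4[0]),   // low 128 bits
//             SIMD128T::abs_epi32(a.v4[1]),   // high 128 bits
//         };
//     }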

#define SIMD_EMU_IWRAPPER_1L(op, shift) \
    static SIMDINLINE \
    Integer SIMDCALL op(Integer a)\
    {\
        return Integer \
        {\
            SIMD128T::op(a.v4[0]), \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
        };\
    }\
    static SIMDINLINE \
    Integer SIMDCALL op(SIMD128Impl::Integer a)\
    {\
        return Integer \
        {\
            SIMD128T::op(a), \
            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
        };\
    }
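// Editorial note: the "L" variants wrap the widening conversions, which read
// only the low half of their source. The first SIMD128T::op converts the low
// lanes; byte-shifting the source right by `shift` (srli_si) then exposes the
// next group of lanes for the upper 128-bit half of the result. For example,
// cvtepu8_epi32 consumes 4 source bytes per 128-bit convert, so it is
// instantiated below with shift = 4.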

#define SIMD_EMU_IWRAPPER_1I(op) \
    template <int ImmT> static SIMDINLINE \
    Integer SIMDCALL op(Integer a)\
    {\
        return Integer\
        {\
            SIMD128T::template op<ImmT>(a.v4[0]),\
            SIMD128T::template op<ImmT>(a.v4[1]),\
        };\
    }

#define SIMD_EMU_IWRAPPER_2(op) \
    static SIMDINLINE \
    Integer SIMDCALL op(Integer a, Integer b)\
    {\
        return Integer\
        {\
            SIMD128T::op(a.v4[0], b.v4[0]),\
            SIMD128T::op(a.v4[1], b.v4[1]),\
        };\
    }

#define SIMD_EMU_IWRAPPER_2I(op) \
    template <int ImmT> static SIMDINLINE \
    Integer SIMDCALL op(Integer a, Integer b)\
    {\
        return Integer\
        {\
            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),\
            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),\
        };\
    }

//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b

static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
    return add_ps(mul_ps(a, b), c);
}

SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm256_round_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_EMU_IWRAPPER_2(mullo_epi32);
SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps);         // return a & b (float treated as int)
SIMD_EMU_IWRAPPER_2(and_si);    // return a & b (int)
SIMD_WRAPPER_2(andnot_ps);      // return (~a) & b (float treated as int)
SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
SIMD_WRAPPER_2(or_ps);          // return a | b (float treated as int)
SIMD_EMU_IWRAPPER_2(or_si);     // return a | b (int)
SIMD_WRAPPER_2(xor_ps);         // return a ^ b (float treated as int)
SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b (int)


//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT

static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b (uint32)
{
    int32_t aHi, aLow, countHi, countLow;
    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

    aHi = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

    aLow = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

    aHi = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

    aLow = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

    aHi = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

    aLow = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

    aHi = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

    aLow = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
    ret = _mm256_insertf128_si256(ret, vAHi, 1);
    ret = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}
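// Editorial note: AVX (1) has no variable per-lane shift (vpsllvd is AVX2),
// so the routine above moves each of the 8 lanes through scalar registers
// with extract/insert. Those intrinsics require compile-time lane indices,
// hence the fully unrolled body rather than a loop. The AVX2 override in
// simdlib_256_avx2.inl replaces all of this with a single _mm256_sllv_epi32.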

SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
SIMD_EMU_IWRAPPER_1I(srli_si);    // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}

static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b (uint32)
{
    int32_t aHi, aLow, countHi, countLow;
    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

    aHi = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

    aLow = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

    aHi = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

    aLow = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

    aHi = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

    aLow = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

    aHi = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

    aLow = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
    ret = _mm256_insertf128_si256(ret, vAHi, 1);
    ret = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}


//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
    return _mm256_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
    return _mm256_castps_si256(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
    return _mm256_castsi256_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
    return _mm256_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
    return _mm256_castpd_si256(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
    return _mm256_castsi256_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
    return _mm256_cvtepi32_ps(a);
}

SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);  // return (int16)a (uint8 --> int16)
SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);  // return (int32)a (uint8 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
    return _mm256_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
    return _mm256_cvttps_epi32(a);
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
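// Usage sketch (editorial, not part of the original sources): comparisons
// return a full-width per-lane mask, so the typical pattern is to compare
// and then reduce to a scalar bitmask with movemask_ps (defined later in
// this file):
//
//     Float m = cmplt_ps(a, b);       // ~0 in lanes where a < b, else 0
//     uint32_t bits = movemask_ps(m); // one bit per lane
//     if (bits == 0xff) { /* all 8 lanes passed */ }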

SIMD_EMU_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_EMU_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return 0 != _mm256_testz_ps(a, b);
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return 0 != _mm256_testz_si256(a, b);
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps);                       // return ImmT ? b : a (float)
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps);                       // return mask ? b : a (float)

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    return _mm256_broadcast_ss(p);
}

SIMD_EMU_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_EMU_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
    Integer result;

    // Ugly slow implementation
    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        pResult[i] = pA[0xF & pSwiz[i]];
    }

    return result;
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    Float result;

    // Ugly slow implementation
    float const *pA = reinterpret_cast<float const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    float *pResult = reinterpret_cast<float *>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        pResult[i] = pA[0xF & pSwiz[i]];
    }

    return result;
}

SIMD_WRAPPER_2I(permute2f128_ps);
SIMD_DWRAPPER_2I(permute2f128_pd);
SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);


SIMD_EMU_IWRAPPER_1I(shuffle_epi32);

template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_EMU_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    uint32_t *pOffsets = (uint32_t*)&idx;
    Float vResult;
    float* pResult = (float*)&vResult;
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
        offset = offset * static_cast<uint32_t>(ScaleT);
        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return _mm256_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return _mm256_load_si256(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return _mm256_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return _mm256_lddqu_si256(&p->v);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    uint32_t *pOffsets = (uint32_t*)&idx;
    Float vResult = old;
    float* pResult = (float*)&vResult;
    DWORD index;
    uint32_t umask = movemask_ps(mask);
    while (_BitScanForward(&index, umask))
    {
        umask &= ~(1 << index);
        uint32_t offset = pOffsets[index];
        offset = offset * static_cast<uint32_t>(ScaleT);
        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
    }

    return vResult;
}
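// Editorial note: DWORD and _BitScanForward are Win32-style names; on
// non-MSVC builds they are assumed to come from the portability shims in
// os.h. The loop only visits lanes whose sign bit is set in the mask, which
// matches the (mask & (1 << 31)) contract stated above.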

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    _mm256_maskstore_ps(p, mask, src);
}

static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    return SIMD128T::movemask_epi8(a.v4[0]) |
           (SIMD128T::movemask_epi8(a.v4[1]) << 16);
}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    return static_cast<uint32_t>(_mm256_movemask_ps(a));
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm256_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm256_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm256_set1_ps(f);
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm256_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm256_setzero_si256();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm256_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm256_store_si256(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    _mm256_stream_ps(p, a);
}

//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================

static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
{
    return _mm256_broadcast_ps(&p->v);
}

template<int ImmT>
static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
{
    return _mm256_extractf128_pd(a, ImmT);
}

template<int ImmT>
static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float a)
{
    return _mm256_extractf128_ps(a, ImmT);
}

template<int ImmT>
static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
{
    return _mm256_extractf128_si256(a, ImmT);
}

template<int ImmT>
static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
{
    return _mm256_insertf128_pd(a, b, ImmT);
}

template<int ImmT>
static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
{
    return _mm256_insertf128_ps(a, b, ImmT);
}

template<int ImmT>
static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
{
    return _mm256_insertf128_si256(a, b, ImmT);
}

#ifndef _mm256_set_m128i
#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
#endif

#ifndef _mm256_loadu2_m128i
#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
                            /* SIMD128Impl::Integer const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
#endif

static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
{
    return _mm256_loadu2_m128i(&phi->v, &plo->v);
}

static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
{
    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
}

#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IFWRAPPER_2I
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_3
#undef SIMD_EMU_IWRAPPER_1
#undef SIMD_EMU_IWRAPPER_1I
#undef SIMD_EMU_IWRAPPER_2
#undef SIMD_EMU_IWRAPPER_2I
234 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl Normal file
@ -0,0 +1,234 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD256 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are those that replace AVX (1) operations.
// Mostly these are integer operations that are no longer emulated with SSE.
//============================================================================

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a);\
    }

#define SIMD_IWRAPPER_1L(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(_mm256_castsi256_si128(a));\
    }

#define SIMD_IWRAPPER_1I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##op(a, ImmT);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm256_##intrin(a, ImmT);\
    }

#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##intrin(a, b);\
    }

#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b);\
    }

#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm256_##op(a, b, ImmT);\
    }

//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
{
    return _mm256_fmadd_ps(a, b, c);
}
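// Editorial note: unlike the AVX (1) fallback add_ps(mul_ps(a, b), c), the
// FMA3 instruction computes (a * b) + c with a single rounding step, so
// results can differ from the emulated path in the last ulp.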

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si256);       // return a & b (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256);         // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256);       // return a ^ b (int)


//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32);           // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);            // return a << b (uint32)
SIMD_IWRAPPER_1I(srai_epi32);           // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32);           // return a >> ImmT (uint32)
SIMD_IWRAPPER_2(srlv_epi32);            // return a >> b (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}


//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1L(cvtepu8_epi16);  // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1L(cvtepu8_epi32);  // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)

static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
{
    return cmpgt_epi32(b, a);
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm256_permutevar8x32_ps(a, swiz);
}

SIMD_IWRAPPER_1I(shuffle_epi32);
template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
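// Usage sketch (editorial): gathering 8 floats from a buffer of 4-byte
// elements, assuming ScaleFactor::SF_4 from simdlib_types.hpp and the
// inherited set_epi32 helper:
//
//     Integer idx = set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
//     Float   v   = i32gather_ps<ScaleFactor::SF_4>(pBuffer, idx);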

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
    // Only for this intrinsic - not sure why. :(
    return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}

static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    return static_cast<uint32_t>(_mm256_movemask_epi8(a));
}

//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================

#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1L
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
409 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl Normal file
@ -0,0 +1,409 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD256 AVX (512) implementation
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are those that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

private:
    static SIMDINLINE __m512  __conv(Float r)   { return _mm512_castps256_ps512(r.v); }
    static SIMDINLINE __m512d __conv(Double r)  { return _mm512_castpd256_pd512(r.v); }
    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }
    static SIMDINLINE Float   __conv(__m512 r)  { return _mm512_castps512_ps256(r); }
    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd256(r); }
    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }
public:

#define SIMD_WRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
    }
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
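// Editorial note: this mirrors the 128-bit AVX512 wrappers, but here the
// zero-mask covers 8 lanes (0xff) instead of 4 (0xf), since a 256-bit Float
// occupies the low 8 float lanes of the zmm register it is widened into.
// The double/64-bit masks below scale the same way (0xf instead of 0x3).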
|
||||
|
||||
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
|
||||
|
||||
#define SIMD_WRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
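
// Editorial note (illustrative, not part of the original commit): with the
// __conv helpers above, SIMD_WRAPPER_2(add_ps) expands to roughly:
//
//     static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
//     {
//         return __conv(_mm512_maskz_add_ps(__mmask16(0xff), __conv(a), __conv(b)));
//     }
//
// The 0xff mask enables only the low 8 of 16 float lanes, so the 256-bit
// operation runs on AVX512 hardware with the upper lanes zeroed and then
// discarded by the cast back down to 256 bits.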

#define SIMD_WRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
    }

#define SIMD_WRAPPER_3_(op, intrin, mask) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
    }
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))

#define SIMD_DWRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Double SIMDCALL op(Double a) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
    }
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
#endif

#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
    template<int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
    }
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
#endif

#define SIMD_DWRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
    }
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
#endif

#define SIMD_DWRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
    }

#define SIMD_IWRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
    }
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op)  SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
#endif

#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
    }
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op)  SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
#endif

#define SIMD_IWRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
    }
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op)  SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
#endif

#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
    }

//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
//SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xff));     // return 1.0f / a
//SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // return a - b

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)

#if !defined(AVX512F_STRICT)

SIMD_IWRAPPER_2_8(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)

#endif

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)

#if !defined(AVX512F_STRICT)

SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8);  // return (b > a) ? 0 : (a - b) (uint8)

#endif

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff));       // return a & b       (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b    (int)
SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff));         // return a | b       (int)
SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff));       // return a ^ b       (int)


//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2_32(sllv_epi32);  // return a << b      (uint32)
SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT   (int32)
SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT   (uint32)
SIMD_IWRAPPER_2_32(srlv_epi32);  // return a >> b      (uint32)

// use AVX2 version
//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)

//-----------------------------------------------------------------------
// Conversion operations (Use AVX2 versions)
//-----------------------------------------------------------------------
// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);  // return (int16)a (uint8 --> int16)
// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);    // return (int32)a (uint8 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);   // return (int32)a (uint16 --> int32)
// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);    // return (int64)a (uint16 --> int64)
// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);    // return (int64)a (uint32 --> int64)

//-----------------------------------------------------------------------
// Comparison operations (Use AVX2 versions)
//-----------------------------------------------------------------------
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);  // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8);  // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
//
//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
//{
//    return cmpgt_epi32(b, a);
//}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
#if !defined(AVX512F_STRICT)

SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

#endif

// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);

//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
//{
//    return _mm256_permutevar8x32_ps(a, swiz);
//}

SIMD_IWRAPPER_1I_32(shuffle_epi32);
//template<int ImmT>
//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
//{
//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
//}
//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);

#if !defined(AVX512F_STRICT)

SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);

#endif

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return __conv(_mm512_maskz_load_ps(__mmask16(0xff), p));
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return __conv(_mm512_maskz_load_epi32(__mmask16(0xff), p));
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
}

template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return __conv(_mm512_mask_i32gather_ps(
                    _mm512_setzero_ps(),
                    __mmask16(0xff),
                    __conv(idx),
                    p,
                    static_cast<int>(ScaleT)));
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 m = 0xff;
    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
                                    _mm512_set1_epi32(0x80000000));
    return __conv(_mm512_mask_i32gather_ps(
                    __conv(old),
                    m,
                    __conv(idx),
                    p,
                    static_cast<int>(ScaleT)));
}
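
// Editorial note (illustrative, not part of the original commit): the gather
// predicate above is built from the sign bit of each float mask lane, i.e.
// the scalar model per lane is (helper name hypothetical):
//
//     float gather_lane(float old, float const* p, uint32_t idx,
//                       uint32_t maskBits, uint32_t scale)
//     {
//         // participate only when bit 31 of the lane's mask element is set
//         return (maskBits & 0x80000000u)
//             ? *(float const*)((uint8_t const*)p + idx * scale)
//             : old;
//     }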

#if !defined(AVX512F_STRICT)

static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    __mmask64 m = 0xffffffffull;
    return static_cast<uint32_t>(
        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}

#endif

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    __mmask16 m = 0xff;
    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
    _mm512_mask_store_ps(p, m, __conv(src));
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm512_mask_store_ps(p, __mmask16(0xff), __conv(a));
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm512_mask_store_epi32(p, __mmask16(0xff), __conv(a));
}

//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

682  src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl  Normal file

@ -0,0 +1,682 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD16 AVX512 (F) implementation
//
// TODO: Optimize for KNL / KNH or for SKX??
//       For now probably optimizing more for KNL as that's where
//       immediate customers are.
//============================================================================

static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;

#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return intrin(a);\
    }

#define SIMD_WRAPPER_1(op) \
    SIMD_WRAPPER_1_(op, _mm512_##op)

#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_castsi512_ps(_mm512_##intrin(\
            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
    }

#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##op(a, b);\
    }

#define SIMD_WRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)

#define SIMD_DWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return _mm512_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return _mm512_##op(a);\
    }
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin) \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)

#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return cmp(a, b);\
    }

#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
    }

#define SIMD_IWRAPPER_2I_(op, intrin) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)

private:
    static SIMDINLINE Integer vmask(__mmask8 m)
    {
        return _mm512_maskz_set1_epi64(m, -1LL);
    }
    static SIMDINLINE Integer vmask(__mmask16 m)
    {
        return _mm512_maskz_set1_epi32(m, -1);
    }
    static SIMDINLINE Integer vmask(__mmask32 m)
    {
        return _mm512_maskz_set1_epi16(m, -1);
    }
    static SIMDINLINE Integer vmask(__mmask64 m)
    {
        return _mm512_maskz_set1_epi8(m, -1);
    }
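
// Editorial note (illustrative, not part of the original commit): the vmask()
// overloads widen an AVX512 predicate register back into the legacy
// all-ones/all-zeros vector mask. Per 32-bit lane i the scalar equivalent is:
//
//     int32_t lane = ((m >> i) & 1) ? -1 : 0;   // -1 == all bits set
//
// which _mm512_maskz_set1_epi32(m, -1) performs in a single instruction.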

public:
//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp28_ps);     // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt28_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(and_si, and_si512);       // return a & b       (int)
SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b    (int)
SIMD_IWRAPPER_2_(or_si, or_si512);         // return a | b       (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512);       // return a ^ b       (int)

#if defined(AVX512F_STRICT)

SIMD_WRAPPERI_2_(and_ps, and_epi32);       // return a & b       (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b    (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32);         // return a | b       (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32);       // return a ^ b       (float treated as int)

#else

SIMD_WRAPPER_2(and_ps);    // return a & b       (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b    (float treated as int)
SIMD_WRAPPER_2(or_ps);     // return a | b       (float treated as int)
SIMD_WRAPPER_2(xor_ps);    // return a ^ b       (float treated as int)

#endif


//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1I(slli_epi32);           // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32);           // return a >> ImmT   (int32)
SIMD_IWRAPPER_1I(srli_epi32);           // return a >> ImmT   (uint32)
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)

template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
    return castsi_ps(srli_si<ImmT>(castps_si(a)));
}

SIMD_IWRAPPER_2(srlv_epi32);

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
    return _mm512_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
    return _mm512_castps_si512(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
    return _mm512_castsi512_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
    return _mm512_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
{
    return _mm512_castpd_si512(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
    return _mm512_castsi512_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
    return _mm512_cvtepi32_ps(a);
}

SIMD_IWRAPPER_1_8(cvtepu8_epi16);  // return (int16)a (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32);  // return (int32)a (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)

static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
{
    return _mm512_cvtps_epi32(a);
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
    return _mm512_cvttps_epi32(a);
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}

template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    // Legacy vector mask generator
    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
    return castsi_ps(vmask(result));
}
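
// Editorial note (illustrative usage, not part of the original commit):
// callers written against the legacy interface keep consuming full-width
// vector masks, e.g.:
//
//     Float m = cmplt_ps(a, b);       // all-ones lanes where a < b
//     Float r = blendv_ps(x, y, m);   // select driven by the lanes' sign bits
//
// while AVX512-native code can call cmp_ps_mask() and feed the __mmask16
// directly into masked intrinsics without materializing a vector mask.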

static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }

template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
    return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
    return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
    return vmask(result);
}
template<CompareTypeInt CmpTypeT>
static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
{
    // Legacy vector mask generator
    __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
    return vmask(result);
}

SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);  // return a == b (int8)
SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);  // return a > b (int8)
SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
}

static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
template <int ImmT>
static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
}

template <int ImmT>
static SIMDINLINE Float blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
{
    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
}

static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
{
    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
}


static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    return _mm512_set1_ps(*p);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    return _mm512_extractf64x4_pd(a, imm);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    return _mm512_extracti64x4_epi64(a, imm);
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    return _mm512_insertf64x4(a, b, imm);
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    return _mm512_inserti64x4(a, b, imm);
}

SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_epi32(swiz, a);
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_ps(swiz, a);
}

SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);

SIMD_IWRAPPER_1I(shuffle_epi32);

SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);

template<int ImmT>
static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
{
    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi16);

//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    uint32_t *pOffsets = (uint32_t*)&idx;
    Float vResult;
    float* pResult = (float*)&vResult;
    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
        offset = offset * static_cast<uint32_t>(ScaleT);
        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
    }

    return vResult;
}
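
// Editorial note (illustrative, not part of the original commit): the loop
// above is the portable reference form of a gather; an equivalent
// intrinsic-based sketch (assuming ScaleT maps to a legal 1/2/4/8 scale)
// would be:
//
//     return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));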

static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return _mm512_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return _mm512_load_si512(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return _mm512_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return _mm512_loadu_si512(p);
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());

    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, m, src);
}

static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
    return static_cast<uint64_t>(m);
}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    __mmask8 m = _mm512_cmplt_pd_mask(a, setzero_pd());
    return static_cast<uint32_t>(m);
}
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    __mmask16 m = _mm512_cmplt_ps_mask(a, setzero_ps());
    return static_cast<uint32_t>(m);
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm512_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm512_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return _mm512_set1_ps(f);
}

static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
{
    return _mm512_setzero_pd();
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return _mm512_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return _mm512_setzero_si512();
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    _mm512_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    _mm512_store_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
{
    _mm512_storeu_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    _mm512_stream_ps(p, a);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm512_set_epi32(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm512_set_ps(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
}

#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_

27  src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl  Normal file

@ -0,0 +1,27 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

// Implement mask-enabled SIMD functions

842  src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl  Normal file

@ -0,0 +1,842 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD16 AVX (1) implementation
//============================================================================

static const int TARGET_SIMD_WIDTH = 8;
using SIMD128T = SIMD128Impl::AVXImpl;

#define SIMD_WRAPPER_1(op) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    {\
        return Float\
        {\
            SIMD256T::op(a.v8[0]),\
            SIMD256T::op(a.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_2(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return Float\
        {\
            SIMD256T::op(a.v8[0], b.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return Float\
        {\
            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
        };\
    }
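
// Editorial note (illustrative, not part of the original commit): with
// TARGET_SIMD_WIDTH == 8, SIMD_WRAPPER_2I splits a one-bit-per-lane immediate
// across the two 256-bit halves, e.g. blend_ps<0xAAAA> becomes:
//
//     SIMD256T::blend_ps<0xAA>(a.v8[0], b.v8[0]);   // lanes 0..7  (bits 0..7)
//     SIMD256T::blend_ps<0xAA>(a.v8[1], b.v8[1]);   // lanes 8..15 (bits 8..15)
//
// SIMD_WRAPPER_2I_1 below is used instead when both halves need the identical
// immediate value.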

#define SIMD_WRAPPER_2I_1(op) \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    {\
        return Float\
        {\
            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    {\
        return Float\
        {\
            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    {\
        return Integer\
        {\
            SIMD256T::op(a.v8[0]),\
            SIMD256T::op(a.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return Integer\
        {\
            SIMD256T::op(a.v8[0], b.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2I(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return Integer\
        {\
            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2I_1(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return Integer\
        {\
            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_2I_2(op) \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    {\
        return Integer\
        {\
            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
        };\
    }

#define SIMD_IWRAPPER_3(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
    {\
        return Integer\
        {\
            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
        };\
    }

//-----------------------------------------------------------------------
// Single precision floating point arithmetic operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);   // return a + b
SIMD_WRAPPER_2(div_ps);   // return a / b
SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);   // return a * b
SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);   // return a - b

template <RoundMode RMT>
static SIMDINLINE Float SIMDCALL round_ps(Float a)
{
    return Float
    {
        SIMD256T::template round_ps<RMT>(a.v8[0]),
        SIMD256T::template round_ps<RMT>(a.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }

//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)

// return (a * b) & 0xFFFFFFFF
//
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
// and store the low 32 bits of the intermediate integers in dst.
SIMD_IWRAPPER_2(mullo_epi32);
SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)

//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)


//-----------------------------------------------------------------------
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
{
    return Integer
    {
        SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
        SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
    };
}

SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)

template<int ImmT>
static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
{
    return Integer
    {
        SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
        SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
    };
}

template<int ImmT>
static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
{
    return Integer
    {
        SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
        SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
    };
}

template<int ImmT> // for each 128-bit lane:
static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
{
    return Integer
    {
        SIMD256T::template srli_si<ImmT>(a.v8[0]),
        SIMD256T::template srli_si<ImmT>(a.v8[1]),
    };
}
template<int ImmT>
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
{
    return Float
    {
        SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
        SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
    };
}

SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)

//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
{
    return Float
    {
        SIMD256T::castpd_ps(a.v8[0]),
        SIMD256T::castpd_ps(a.v8[1]),
    };
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
{
    return Integer
    {
        SIMD256T::castps_si(a.v8[0]),
        SIMD256T::castps_si(a.v8[1]),
    };
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
{
    return Double
    {
        SIMD256T::castsi_pd(a.v8[0]),
        SIMD256T::castsi_pd(a.v8[1]),
    };
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
{
    return Double
    {
        SIMD256T::castps_pd(a.v8[0]),
        SIMD256T::castps_pd(a.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
{
    return Float
    {
        SIMD256T::castsi_ps(a.v8[0]),
        SIMD256T::castsi_ps(a.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
{
    return Float
    {
        SIMD256T::cvtepi32_ps(a.v8[0]),
        SIMD256T::cvtepi32_ps(a.v8[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
{
    return Integer
    {
        SIMD256T::cvtepu8_epi16(a.v4[0]),
        SIMD256T::cvtepu8_epi16(a.v4[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
{
    return Integer
    {
        SIMD256T::cvtepu8_epi32(a.v4[0]),
        SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
{
    return Integer
    {
        SIMD256T::cvtepu16_epi32(a.v4[0]),
        SIMD256T::cvtepu16_epi32(a.v4[1]),
    };
}

static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
{
    return Integer
    {
        SIMD256T::cvtepu16_epi64(a.v4[0]),
        SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
    };
}
|
||||
|
||||
static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
|
||||
{
|
||||
return Integer
|
||||
{
|
||||
SIMD256T::cvtepu32_epi64(a.v4[0]),
|
||||
SIMD256T::cvtepu32_epi64(a.v4[1]),
|
||||
};
|
||||
}
|
||||
|
||||
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
|
||||
{
|
||||
return Integer
|
||||
{
|
||||
SIMD256T::cvtps_epi32(a.v8[0]),
|
||||
SIMD256T::cvtps_epi32(a.v8[1]),
|
||||
};
|
||||
}

static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
{
    return Integer
    {
        SIMD256T::cvttps_epi32(a.v8[0]),
        SIMD256T::cvttps_epi32(a.v8[1]),
    };
}

//-----------------------------------------------------------------------
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
{
    return Float
    {
        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
    };
}
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }

template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}
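
// Illustrative usage sketch (hypothetical values, assuming the semantics
// documented above): cmp_ps yields an all-ones/all-zeros lane mask that
// composes with the blend operations, e.g. a per-lane minimum:
//   Float m = cmplt_ps(a, b);        // all-ones lanes where a < b
//   Float r = blendv_ps(b, a, m);    // mask ? a : b, i.e. min(a, b) per lane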

SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)

static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
    return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
                 SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}

static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
    return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
                 SIMD256T::testz_si(a.v8[1], b.v8[1]));
}

//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
SIMD_WRAPPER_2I(blend_ps);      // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps);      // return mask ? b : a (float)
static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
{
    return Integer
    {
        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
    };
}

static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
{
    return Integer
    {
        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
{
    float f = *p;
    return Float
    {
        SIMD256T::set1_ps(f),
        SIMD256T::set1_ps(f),
    };
}

template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    return a.v8[imm];
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    return a.v8[imm];
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    return a.v8[imm];
}

template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    a.v8[imm] = b;
    return a;
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    a.v8[imm] = b;
    return a;
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
    a.v8[imm] = b;
    return a;
}

SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32

static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
    Integer result;

    // Ugly slow implementation
    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        pResult[i] = pA[0xF & pSwiz[i]];
    }

    return result;
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
    Float result;

    // Ugly slow implementation
    float const *pA = reinterpret_cast<float const*>(&a);
    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
    float *pResult = reinterpret_cast<float *>(&result);

    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
    {
        pResult[i] = pA[0xF & pSwiz[i]];
    }

    return result;
}

// All of the 512-bit permute2f128_XX intrinsics do the following:
//
//      SELECT4(src, control) {
//          CASE(control[1:0])
//              0:  tmp[127:0] := src[127:0]
//              1:  tmp[127:0] := src[255:128]
//              2:  tmp[127:0] := src[383:256]
//              3:  tmp[127:0] := src[511:384]
//          ESAC
//          RETURN tmp[127:0]
//      }
//
//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
//      dst[MAX:512] := 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
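// Illustrative worked example (not normative): for the AVX512-style control
// shuf = 0xE4 (lane selectors 3,2,1,0), the low half's expanded control is
// ((0xE4 & 0x03) << 0) | ((0xE4 & 0x0C) << 2) = 0x10, which vperm2f128 reads
// as "dst.lo = src1.lo, dst.hi = src1.hi"; the high half's control is
// ((0xE4 & 0x30) >> 4) | ((0xE4 & 0xC0) >> 2) = 0x32, i.e.
// "dst.lo = src2.lo, dst.hi = src2.hi". Together that is the identity
// permute, matching SELECT4 with imm8 = 0xE4.
//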
template <int shuf>
static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
{
    return Float
    {
        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
    };
}

template <int shuf>
static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
{
    return Double
    {
        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
    };
}

template <int shuf>
static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
{
    return Integer
    {
        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
    };
}

SIMD_IWRAPPER_2I_1(shuffle_epi32);
SIMD_IWRAPPER_2I_2(shuffle_epi64);
SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_WRAPPER_2I_1(shuffle_pd);
SIMD_WRAPPER_2I_1(shuffle_ps);
SIMD_IWRAPPER_2(unpackhi_epi16);
SIMD_IWRAPPER_2(unpackhi_epi32);
SIMD_IWRAPPER_2(unpackhi_epi64);
SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_WRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IWRAPPER_2(unpacklo_epi32);
SIMD_IWRAPPER_2(unpacklo_epi64);
SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_WRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);

//-----------------------------------------------------------------------
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return Float
    {
        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
{
    return Float
    {
        SIMD256T::load_ps(p),
        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
    };
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
{
    return Integer
    {
        SIMD256T::load_si(&p->v8[0]),
        SIMD256T::load_si(&p->v8[1]),
    };
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
{
    return Float
    {
        SIMD256T::loadu_ps(p),
        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
    };
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
{
    return Integer
    {
        SIMD256T::loadu_si(&p->v8[0]),
        SIMD256T::loadu_si(&p->v8[1]),
    };
}

// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    return Float
    {
        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
    };
}

static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}

static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
{
    uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
    mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);

    return mask;
}

static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
    mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);

    return mask;
}

static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
    mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;

    return mask;
}
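
// Illustrative note on the mask composition above: each 256-bit half
// contributes TARGET_SIMD_WIDTH * 4 = 32 mask bits for movemask_epi8 (one per
// byte), TARGET_SIMD_WIDTH = 8 bits for movemask_ps (one per float), and
// TARGET_SIMD_WIDTH / 2 = 4 bits for movemask_pd (one per double), which is
// why the high half's mask is shifted by those amounts.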

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return Integer
    {
        SIMD256T::set1_epi32(i),
        SIMD256T::set1_epi32(i)
    };
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return Integer
    {
        SIMD256T::set1_epi8(i),
        SIMD256T::set1_epi8(i)
    };
}

static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
{
    return Float
    {
        SIMD256T::set1_ps(f),
        SIMD256T::set1_ps(f)
    };
}

static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
{
    return Float
    {
        SIMD256T::setzero_ps(),
        SIMD256T::setzero_ps()
    };
}

static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
{
    return Integer
    {
        SIMD256T::setzero_si(),
        SIMD256T::setzero_si()
    };
}

static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
{
    SIMD256T::store_ps(p, a.v8[0]);
    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
{
    SIMD256T::store_si(&p->v8[0], a.v8[0]);
    SIMD256T::store_si(&p->v8[1], a.v8[1]);
}

static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
    SIMD256T::stream_ps(p, a.v8[0]);
    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return Integer
    {
        SIMD256T::set_epi32(
            i7, i6, i5, i4, i3, i2, i1, i0),
        SIMD256T::set_epi32(
            i15, i14, i13, i12, i11, i10, i9, i8)
    };
}

static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return Float
    {
        SIMD256T::set_ps(
            i7, i6, i5, i4, i3, i2, i1, i0),
        SIMD256T::set_ps(
            i15, i14, i13, i12, i11, i10, i9, i8)
    };
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    Integer vec = set1_epi32(mask);
    const Integer bit = set_epi32(
        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    vec = and_si(vec, bit);
    vec = cmplt_epi32(setzero_si(), vec);
    return castsi_ps(vec);
}
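
// Illustrative example (assuming the semantics above): vmask_ps(0x0005)
// produces all-ones in lanes 0 and 2 and zero elsewhere. Each bit i of 'mask'
// is isolated via the per-lane power-of-two table, then widened into a full
// 32-bit lane mask by the signed compare against zero.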

#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_2I_1
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2I
#undef SIMD_IWRAPPER_2I_1
#undef SIMD_IWRAPPER_3

@ -0,0 +1,28 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

// no backwards compatibility for simd mask-enabled functions
428	src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp	Normal file
@ -0,0 +1,428 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples. The SIMD256 and SIMD16 implementations
    // will use different base types with this same naming.
    using Float   = __m256;   // Packed single-precision float vector
    using Double  = __m256d;  // Packed double-precision float vector
    using Integer = __m256i;  // Packed integer vector (mutable element widths)
    using Mask    = uint8_t;  // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float add_ps(Float a, Float b);              // return a + b
    static Float div_ps(Float a, Float b);              // return a / b
    static Float fmadd_ps(Float a, Float b, Float c);   // return (a * b) + c
    static Float fmsub_ps(Float a, Float b, Float c);   // return (a * b) - c
    static Float max_ps(Float a, Float b);              // return (a > b) ? a : b
    static Float min_ps(Float a, Float b);              // return (a < b) ? a : b
    static Float mul_ps(Float a, Float b);              // return a * b
    static Float rcp_ps(Float a);                       // return 1.0f / a
    static Float rsqrt_ps(Float a);                     // return 1.0f / sqrt(a)
    static Float sub_ps(Float a, Float b);              // return a - b

    enum class RoundMode
    {
        TO_NEAREST_INT = 0x00,  // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
        TO_NEG_INF     = 0x01,  // Round to negative infinity
        TO_POS_INF     = 0x02,  // Round to positive infinity
        TO_ZERO        = 0x03,  // Round to 0 a.k.a. truncate
        CUR_DIRECTION  = 0x04,  // Round in direction set in MXCSR register

        RAISE_EXC      = 0x00,  // Raise exception on overflow
        NO_EXC         = 0x08,  // Suppress exceptions

        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR       = static_cast<int>(TO_NEG_INF)     | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF)     | static_cast<int>(NO_EXC),
        CEIL        = static_cast<int>(TO_POS_INF)     | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC  = static_cast<int>(TO_POS_INF)     | static_cast<int>(NO_EXC),
        TRUNC       = static_cast<int>(TO_ZERO)        | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC = static_cast<int>(TO_ZERO)        | static_cast<int>(NO_EXC),
        RINT        = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(RAISE_EXC),
        NEARBYINT   = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen based on the RMT template parameter. See the
    // documentation for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float round_ps(Float a);                     // return round(a)
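
    // Illustrative usage sketch (hypothetical, following the placeholder
    // SIMD256 naming above): exception-suppressing floor and truncate would
    // be written as
    //   Float f = SIMD256::round_ps<SIMD256::RoundMode::FLOOR_NOEXC>(x);
    //   Float t = SIMD256::round_ps<SIMD256::RoundMode::TRUNC_NOEXC>(x);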

    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer abs_epi32(Integer a);                // return absolute_value(a) (int32)
    static Integer add_epi32(Integer a, Integer b);     // return a + b (int32)
    static Integer add_epi8(Integer a, Integer b);      // return a + b (int8)
    static Integer adds_epu8(Integer a, Integer b);     // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer max_epi32(Integer a, Integer b);     // return (a > b) ? a : b (int32)
    static Integer max_epu32(Integer a, Integer b);     // return (a > b) ? a : b (uint32)
    static Integer min_epi32(Integer a, Integer b);     // return (a < b) ? a : b (int32)
    static Integer min_epu32(Integer a, Integer b);     // return (a < b) ? a : b (uint32)
    static Integer mul_epi32(Integer a, Integer b);     // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    static Integer mullo_epi32(Integer a, Integer b);

    static Integer sub_epi32(Integer a, Integer b);     // return a - b (int32)
    static Integer sub_epi64(Integer a, Integer b);     // return a - b (int64)
    static Integer subs_epu8(Integer a, Integer b);     // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float and_ps(Float a, Float b);              // return a & b    (float treated as int)
    static Integer and_si(Integer a, Integer b);        // return a & b    (int)
    static Float andnot_ps(Float a, Float b);           // return (~a) & b (float treated as int)
    static Integer andnot_si(Integer a, Integer b);     // return (~a) & b (int)
    static Float or_ps(Float a, Float b);               // return a | b    (float treated as int)
    static Integer or_si(Integer a, Integer b);         // return a | b    (int)
    static Float xor_ps(Float a, Float b);              // return a ^ b    (float treated as int)
    static Integer xor_si(Integer a, Integer b);        // return a ^ b    (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Integer slli_epi32(Integer a);               // return a << ImmT
    static Integer sllv_epi32(Integer a, Integer b);    // return a << b
    template<int ImmT>
    static Integer srai_epi32(Integer a);               // return a >> ImmT (int32)
    template<int ImmT>
    static Integer srli_epi32(Integer a);               // return a >> ImmT (uint32)
    template<int ImmT>                                  // for each 128-bit lane:
    static Integer srli_si(Integer a);                  //  return a >> (ImmT*8) (uint)
    template<int ImmT>
    static Float srlisi_ps(Float a);                    // same as srli_si, but with Float cast to int
    static Integer srlv_epi32(Integer a, Integer b);    // return a >> b (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //-----------------------------------------------------------------------
    static Float castpd_ps(Double a);                   // return *(Float*)(&a)
    static Integer castps_si(Float a);                  // return *(Integer*)(&a)
    static Double castsi_pd(Integer a);                 // return *(Double*)(&a)
    static Double castps_pd(Float a);                   // return *(Double*)(&a)
    static Float castsi_ps(Integer a);                  // return *(Float*)(&a)
    static Float cvtepi32_ps(Integer a);                // return (float)a (int32 --> float)
    static Integer cvtepu8_epi16(Integer a);            // return (int16)a (uint8 --> int16)
    static Integer cvtepu8_epi32(Integer a);            // return (int32)a (uint8 --> int32)
    static Integer cvtepu16_epi32(Integer a);           // return (int32)a (uint16 --> int32)
    static Integer cvtepu16_epi64(Integer a);           // return (int64)a (uint16 --> int64)
    static Integer cvtepu32_epi64(Integer a);           // return (int64)a (uint32 --> int64)
    static Integer cvtps_epi32(Float a);                // return (int32)a (float --> int32)
    static Integer cvttps_epi32(Float a);               // return (int32)a (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //  - ordered comparisons are always false if either operand is NaN
    //  - unordered comparisons are always true if either operand is NaN
    //  - signaling comparisons raise an exception if either operand is NaN
    //  - non-signaling comparisons will never raise an exception
    //
    // Ordered:   return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (nonsignaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (nonsignaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template<CompareType CmpTypeT>
    static Float cmp_ps(Float a, Float b);              // return a (CmpTypeT) b (see above)
    static Float cmpgt_ps(Float a, Float b);            // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float cmple_ps(Float a, Float b);            // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float cmplt_ps(Float a, Float b);            // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float cmpneq_ps(Float a, Float b);           // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float cmpeq_ps(Float a, Float b);            // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float cmpge_ps(Float a, Float b);            // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer cmpeq_epi8(Integer a, Integer b);    // return a == b (int8)
    static Integer cmpeq_epi16(Integer a, Integer b);   // return a == b (int16)
    static Integer cmpeq_epi32(Integer a, Integer b);   // return a == b (int32)
    static Integer cmpeq_epi64(Integer a, Integer b);   // return a == b (int64)
    static Integer cmpgt_epi8(Integer a, Integer b);    // return a > b (int8)
    static Integer cmpgt_epi16(Integer a, Integer b);   // return a > b (int16)
    static Integer cmpgt_epi32(Integer a, Integer b);   // return a > b (int32)
    static Integer cmpgt_epi64(Integer a, Integer b);   // return a > b (int64)
    static Integer cmplt_epi32(Integer a, Integer b);   // return a < b (int32)
    static bool testz_ps(Float a, Float b);             // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool testz_si(Integer a, Integer b);         // return all_lanes_zero(a & b) ? 1 : 0 (int)
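
    // Illustrative usage sketch (hypothetical values, assuming the
    // declarations above): clamping a vector to an upper bound:
    //   Float limit = set1_ps(1.0f);
    //   Float m     = cmpgt_ps(v, limit);     // lanes where v > 1.0f
    //   Float r     = blendv_ps(v, limit, m); // mask ? limit : v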

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Float blend_ps(Float a, Float b);                        // return ImmT ? b : a (float)
    static Integer blendv_epi32(Integer a, Integer b, Float mask);  // return mask ? b : a (int)
    static Float blendv_ps(Float a, Float b, Float mask);           // return mask ? b : a (float)
    static Float broadcast_ss(float const *p);                      // return *p (all elements in vector get same value)
    static Integer packs_epi16(Integer a, Integer b);               // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer packs_epi32(Integer a, Integer b);               // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer packus_epi16(Integer a, Integer b);              // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer packus_epi32(Integer a, Integer b);              // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer permute_epi32(Integer a, Integer swiz);          // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float permute_ps(Float a, Integer swiz);                 // return a[swiz[i]] for each 32-bit lane i (float)
    template<int SwizT>
    static Integer shuffle_epi32(Integer a, Integer b);
    template<int SwizT>
    static Integer shuffle_epi64(Integer a, Integer b);
    static Integer shuffle_epi8(Integer a, Integer b);
    template<int SwizT>
    static Double shuffle_pd(Double a, Double b);
    template<int SwizT>
    static Float shuffle_ps(Float a, Float b);
    static Integer unpackhi_epi16(Integer a, Integer b);
    static Integer unpackhi_epi32(Integer a, Integer b);
    static Integer unpackhi_epi64(Integer a, Integer b);
    static Integer unpackhi_epi8(Integer a, Integer b);
    static Double unpackhi_pd(Double a, Double b);
    static Float unpackhi_ps(Float a, Float b);
    static Integer unpacklo_epi16(Integer a, Integer b);
    static Integer unpacklo_epi32(Integer a, Integer b);
    static Integer unpacklo_epi64(Integer a, Integer b);
    static Integer unpacklo_epi8(Integer a, Integer b);
    static Double unpacklo_pd(Double a, Double b);
    static Float unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------
    enum class ScaleFactor
    {
        SF_1,   // No scaling
        SF_2,   // Scale offset by 2
        SF_4,   // Scale offset by 4
        SF_8,   // Scale offset by 8
    };

    template<ScaleFactor ScaleT>
    static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float load1_ps(float const *p);                  // return *p (broadcast 1 value to all elements)
    static Float load_ps(float const *p);                   // return *p (loads SIMD width elements from memory)
    static Integer load_si(Integer const *p);               // return *p
    static Float loadu_ps(float const *p);                  // return *p (same as load_ps but allows for unaligned mem)
    static Integer loadu_si(Integer const *p);              // return *p (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    template<ScaleFactor ScaleT>
    static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void maskstore_ps(float *p, Integer mask, Float src);
    static int movemask_epi8(Integer a);
    static int movemask_pd(Double a);
    static int movemask_ps(Float a);
    static Integer set1_epi32(int i);                       // return i (all elements are same value)
    static Integer set1_epi8(char i);                       // return i (all elements are same value)
    static Float set1_ps(float f);                          // return f (all elements are same value)
    static Float setzero_ps();                              // return 0 (float)
    static Integer setzero_si();                            // return 0 (integer)
    static void store_ps(float *p, Float a);                // *p = a (stores all elements contiguously in memory)
    static void store_si(Integer *p, Integer a);            // *p = a
    static void stream_ps(float *p, Float a);               // *p = a (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //=======================================================================

    static Float broadcast_ps(__m128 const *p);
    template<int ImmT>
    static __m128d extractf128_pd(Double a);
    template<int ImmT>
    static __m128 extractf128_ps(Float a);
    template<int ImmT>
    static __m128i extractf128_si(Integer a);
    template<int ImmT>
    static Double insertf128_pd(Double a, __m128d b);
    template<int ImmT>
    static Float insertf128_ps(Float a, __m128 b);
    template<int ImmT>
    static Integer insertf128_si(Integer a, __m128i b);
    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
    template<int ImmT>
    static Double permute2f128_pd(Double a, Double b);
    template<int ImmT>
    static Float permute2f128_ps(Float a, Float b);
    template<int ImmT>
    static Integer permute2f128_si(Integer a, Integer b);
    static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void storeu2_si(__m128i *phi, __m128i *plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================


    //=======================================================================
    // Extended Utility Functions (common to SIMD256 and SIMD16)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Extended Types
    //-----------------------------------------------------------------------

    // Vec4, an SOA SIMD set of 4-dimensional vectors
    union Vec4
    {
        Vec4() = default;
        Vec4(Float in)
        {
            s.x = in;
            s.y = in;
            s.z = in;
            s.w = in;
        }
        Vec4(Float x, Float y, Float z, Float w)
        {
            s.x = x;
            s.y = y;
            s.z = z;
            s.w = w;
        }

        Float v[4];
        Integer vi[4];
        struct
        {
            Float x;
            Float y;
            Float z;
            Float w;
        } s;
        Float& operator[] (const int i) { return v[i]; }
        Float const & operator[] (const int i) const { return v[i]; }
    };
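
    // Illustrative note: because Vec4 is SOA, s.x is not a single float but a
    // full SIMD register of x components; with the example SIMD256 types
    // above, one Vec4 therefore holds 8 complete (x, y, z, w) vectors, one
    // per lane.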

    //-----------------------------------------------------------------------
    // Extended Functions
    //-----------------------------------------------------------------------
    static void vec4_set1_ps(Vec4& r, const float *p);                // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
    static void vec4_set1_vps(Vec4& r, Float s);                      // r[0] = s, r[1] = s, ...
    static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1);         // return dp3(v0, v1)
    static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1);         // return dp4(v0, v1)
    static Float vec4_rcp_length_ps(const Vec4& v);                   // return 1.0f / sqrt(dp4(v, v))
    static void vec4_normalize_ps(Vec4& r, const Vec4& v);            // r = v * rcp_length(v)
    static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s);         // r = v * set1_vps(s)
    static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
    static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
    static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s);        // r = (v0 < s) ? v0 : s
    static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s);        // r = (v0 > s) ? v0 : s
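
    // Illustrative expansion (per SIMD lane, assuming the SOA layout above):
    //   dp3(v0, v1) = x0*x1 + y0*y1 + z0*z1
    //   dp4(v0, v1) = x0*x1 + y0*y1 + z0*z1 + w0*w1
    // and vec4_normalize_ps scales each component by 1.0f / sqrt(dp4(v, v)).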

    // Matrix4x4 * Vector4
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
    static void mat4x4_vec4_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v);

    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
    static void mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v);

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
    static void mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v);

    // Matrix4x3 * Vector3 - Position vector where w = 1.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
    //   result.s.w = 1
    static void mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float *pMatrix,
        const Vec4& v);
};
#endif // #if 0
377	src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp	Normal file
@ -0,0 +1,377 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once

#if !defined(__cplusplus)
#error C++ compilation required
#endif

#include <immintrin.h>
#include <inttypes.h>
#include <stdint.h>

#define SIMD_ARCH_AVX    0
#define SIMD_ARCH_AVX2   1
#define SIMD_ARCH_AVX512 2

#if !defined(SIMD_ARCH)
#define SIMD_ARCH SIMD_ARCH_AVX
#endif

#if defined(_MSC_VER)
#define SIMDCALL __vectorcall
#define SIMDINLINE __forceinline
#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
#else
#define SIMDCALL
#define SIMDINLINE inline
#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
#endif

// For documentation, please see the following include...
// #include "simdlib_interface.hpp"

namespace SIMDImpl
{
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (nonsignaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (nonsignaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

#if SIMD_ARCH >= SIMD_ARCH_AVX512
    enum class CompareTypeInt
    {
        EQ = _MM_CMPINT_EQ, // Equal
        LT = _MM_CMPINT_LT, // Less than
        LE = _MM_CMPINT_LE, // Less than or Equal
        NE = _MM_CMPINT_NE, // Not Equal
        GE = _MM_CMPINT_GE, // Greater than or Equal
        GT = _MM_CMPINT_GT, // Greater than
    };
#endif // SIMD_ARCH >= SIMD_ARCH_AVX512

    enum class ScaleFactor
    {
        SF_1 = 1,   // No scaling
        SF_2 = 2,   // Scale offset by 2
        SF_4 = 4,   // Scale offset by 4
        SF_8 = 8,   // Scale offset by 8
    };

    enum class RoundMode
    {
        TO_NEAREST_INT = 0x00,  // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
        TO_NEG_INF     = 0x01,  // Round to negative infinity
        TO_POS_INF     = 0x02,  // Round to positive infinity
        TO_ZERO        = 0x03,  // Round to 0 a.k.a. truncate
        CUR_DIRECTION  = 0x04,  // Round in direction set in MXCSR register

        RAISE_EXC      = 0x00,  // Raise exception on overflow
        NO_EXC         = 0x08,  // Suppress exceptions

        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR       = static_cast<int>(TO_NEG_INF)     | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF)     | static_cast<int>(NO_EXC),
        CEIL        = static_cast<int>(TO_POS_INF)     | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC  = static_cast<int>(TO_POS_INF)     | static_cast<int>(NO_EXC),
        TRUNC       = static_cast<int>(TO_ZERO)        | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC = static_cast<int>(TO_ZERO)        | static_cast<int>(NO_EXC),
        RINT        = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(RAISE_EXC),
        NEARBYINT   = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(NO_EXC),
    };

    struct Traits
    {
        using CompareType = SIMDImpl::CompareType;
        using ScaleFactor = SIMDImpl::ScaleFactor;
        using RoundMode   = SIMDImpl::RoundMode;
    };

    // Attribute, 4-dimensional attribute in SIMD SOA layout
    template<typename Float, typename Integer, typename Double>
    union Vec4
    {
        Float v[4];
        Integer vi[4];
        Double vd[4];
        struct
        {
            Float x;
            Float y;
            Float z;
            Float w;
        };
        SIMDINLINE Float& operator[] (const int i) { return v[i]; }
        SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
        SIMDINLINE Vec4& operator=(Vec4 const & in)
        {
            v[0] = in.v[0];
            v[1] = in.v[1];
            v[2] = in.v[2];
            v[3] = in.v[3];
            return *this;
        }
    };

    namespace SIMD128Impl
    {
        union Float
        {
            SIMDINLINE Float() = default;
            SIMDINLINE Float(__m128 in) : v(in) {}
            SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m128() const { return v; }

            SIMDALIGN(__m128, 16) v;
        };

        union Integer
        {
            SIMDINLINE Integer() = default;
            SIMDINLINE Integer(__m128i in) : v(in) {}
            SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m128i() const { return v; }

            SIMDALIGN(__m128i, 16) v;
        };

        union Double
        {
            SIMDINLINE Double() = default;
            SIMDINLINE Double(__m128d in) : v(in) {}
            SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m128d() const { return v; }

            SIMDALIGN(__m128d, 16) v;
        };

        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
        using Mask = uint8_t;

        static const uint32_t SIMD_WIDTH = 4;
    } // ns SIMD128Impl

    namespace SIMD256Impl
    {
        union Float
        {
            SIMDINLINE Float() = default;
            SIMDINLINE Float(__m256 in) : v(in) {}
            SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
            {
                v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
            }
            SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m256() const { return v; }

            SIMDALIGN(__m256, 32) v;
            SIMD128Impl::Float v4[2];
        };

        union Integer
        {
            SIMDINLINE Integer() = default;
            SIMDINLINE Integer(__m256i in) : v(in) {}
            SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
            {
                v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
            }
            SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m256i() const { return v; }

            SIMDALIGN(__m256i, 32) v;
            SIMD128Impl::Integer v4[2];
        };

        union Double
        {
            SIMDINLINE Double() = default;
            SIMDINLINE Double(__m256d in) : v(in) {}
            SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
            {
                v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
            }
            SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
            SIMDINLINE operator __m256d() const { return v; }

            SIMDALIGN(__m256d, 32) v;
            SIMD128Impl::Double v4[2];
        };

        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
        using Mask = uint8_t;

        static const uint32_t SIMD_WIDTH = 8;
    } // ns SIMD256Impl

    namespace SIMD512Impl
    {
#if !defined(_MM_K0_REG)
        // Define AVX512 types if not included via immintrin.h.
        // All data members of these types are ONLY to be viewed
        // in a debugger. Do NOT access them via code!
        union __m512
        {
        private:
            float m512_f32[16];
        };
        struct __m512d
        {
        private:
            double m512d_f64[8];
        };

        union __m512i
        {
        private:
            int8_t   m512i_i8[64];
            int16_t  m512i_i16[32];
            int32_t  m512i_i32[16];
            int64_t  m512i_i64[8];
            uint8_t  m512i_u8[64];
            uint16_t m512i_u16[32];
            uint32_t m512i_u32[16];
            uint64_t m512i_u64[8];
        };

        using __mmask16 = uint16_t;
#endif

#if SIMD_ARCH >= SIMD_ARCH_AVX512
#define SIMD_ALIGNMENT_BYTES 64
#else
#define SIMD_ALIGNMENT_BYTES 32
#endif

        union Float
        {
            SIMDINLINE Float() = default;
            SIMDINLINE Float(__m512 in) : v(in) {}
            SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
            SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
            SIMDINLINE Float& operator=(Float const & in)
            {
#if SIMD_ARCH >= SIMD_ARCH_AVX512
                v = in.v;
#else
                v8[0] = in.v8[0];
                v8[1] = in.v8[1];
#endif
                return *this;
            }
            SIMDINLINE operator __m512() const { return v; }

            SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
            SIMD256Impl::Float v8[2];
        };

        union Integer
        {
            SIMDINLINE Integer() = default;
            SIMDINLINE Integer(__m512i in) : v(in) {}
            SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
            SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
            SIMDINLINE Integer& operator=(Integer const & in)
            {
#if SIMD_ARCH >= SIMD_ARCH_AVX512
                v = in.v;
#else
                v8[0] = in.v8[0];
                v8[1] = in.v8[1];
#endif
                return *this;
            }

            SIMDINLINE operator __m512i() const { return v; }

            SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
            SIMD256Impl::Integer v8[2];
        };

        union Double
        {
            SIMDINLINE Double() = default;
            SIMDINLINE Double(__m512d in) : v(in) {}
            SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
            SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
            SIMDINLINE Double& operator=(Double const & in)
            {
#if SIMD_ARCH >= SIMD_ARCH_AVX512
                v = in.v;
#else
                v8[0] = in.v8[0];
                v8[1] = in.v8[1];
#endif
                return *this;
            }

            SIMDINLINE operator __m512d() const { return v; }

            SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
            SIMD256Impl::Double v8[2];
        };

        typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
        using Mask = __mmask16;

        static const uint32_t SIMD_WIDTH = 16;

#undef SIMD_ALIGNMENT_BYTES
    } // ns SIMD512Impl
} // ns SIMDImpl
|
||||
|
|
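The v8[2] member that overlays each 512-bit value above is what makes these types usable below AVX512: every operation is simply applied to the two 256-bit halves. A minimal standalone sketch of that emulation pattern (illustrative names, not the SIMDLib API):

#include <immintrin.h>

// Illustrative stand-in for the 512-bit unions above on pre-AVX512
// hardware: a 512-bit value emulated as two 256-bit halves.
union Emu512
{
    __m256 v8[2];
};

// An emulated 512-bit add is just the 256-bit op applied per half.
inline Emu512 emu_add_ps(Emu512 a, Emu512 b)
{
    Emu512 r;
    r.v8[0] = _mm256_add_ps(a.v8[0], b.v8[0]);
    r.v8[1] = _mm256_add_ps(a.v8[1], b.v8[1]);
    return r;
}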
@ -43,10 +43,10 @@ enum SWR_BACKEND_FUNCS
};

#if KNOB_SIMD_WIDTH == 8
static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#define MASK 0xff
#endif
@ -163,52 +163,52 @@ struct generateInputCoverage
uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
if(T::MultisampleT::numSamples == 1)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
}
else if(T::MultisampleT::numSamples == 2)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 4)
{
sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 8)
{
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
}
else if(T::MultisampleT::numSamples == 16)
{
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
}
}
else
{
__m256i src = _mm256_set1_epi32(0);
__m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
simdscalari src = _simd_set1_epi32(0);
simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;

if(T::MultisampleT::numSamples == 1)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
}
else if(T::MultisampleT::numSamples == 2)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
}
else if(T::MultisampleT::numSamples == 4)
{
mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
}
else if(T::MultisampleT::numSamples == 8)
{
mask[0] = _mm256_set1_epi32(-1);
mask[0] = _simd_set1_epi32(-1);
}
else if(T::MultisampleT::numSamples == 16)
{
mask[0] = _mm256_set1_epi32(-1);
mask[1] = _mm256_set1_epi32(-1);
index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
mask[0] = _simd_set1_epi32(-1);
mask[1] = _simd_set1_epi32(-1);
index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
}

// gather coverage for samples 0-7
@ -253,14 +253,14 @@ struct generateInputCoverage
packedSampleCoverage = packedCoverage0;
}
#else
simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);

simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
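The permute mask in this hunk picks dword 0 and dword 4 (the low dword of each 128-bit lane) into the two low dwords of the result; the 0x7 entries are don't-cares. A self-contained AVX2 check of that idiom (illustrative, compile with -mavx2):

#include <immintrin.h>
#include <cstdio>

int main()
{
    __m256i v = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // lane i holds i
    // Select dword 0 and dword 4 into the two low dwords; rest are don't-care.
    __m256i perm = _mm256_set_epi32(7, 7, 7, 7, 7, 7, 4, 0);
    __m256i packed = _mm256_permutevar8x32_epi32(v, perm);
    int out[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), packed);
    printf("%d %d\n", out[0], out[1]); // prints: 0 4
    return 0;
}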
@ -293,7 +293,7 @@ struct generateInputCoverage
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
}

};
@ -305,10 +305,10 @@ struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
simdscalari vec = _simd_set1_epi32(coverageMask[0]);
const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
inputCoverage = _simd_castsi_ps(vec);
}
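The broadcast/AND/compare sequence above is the standard way to fan a scalar coverage bitmask out to one boolean per SIMD lane. A standalone AVX2 sketch of the same idiom, using cmpgt(vec, 0), which is equivalent to the cmplt(0, vec) form in the hunk:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    uint32_t coverage = 0xA5; // bits 0, 2, 5, 7 set
    __m256i vec = _mm256_set1_epi32(coverage);
    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    vec = _mm256_and_si256(vec, bit);
    // lane i becomes all-ones if bit i of the mask was set
    vec = _mm256_cmpgt_epi32(vec, _mm256_setzero_si256());
    int lanes[8];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(lanes), vec);
    for (int i = 0; i < 8; ++i)
        printf("lane %d: %s\n", i, lanes[i] ? "covered" : "not covered");
    return 0;
}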
@ -357,7 +357,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
(inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);

// look up and set the sample offsets from UL pixel corner for first covered sample
__m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
samplePos.X(sampleNum[6]),
samplePos.X(sampleNum[5]),
samplePos.X(sampleNum[4]),
@ -366,7 +366,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
samplePos.X(sampleNum[1]),
samplePos.X(sampleNum[0]));

__m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
samplePos.Y(sampleNum[6]),
samplePos.Y(sampleNum[5]),
samplePos.Y(sampleNum[4]),
@ -380,7 +380,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS

// Case (1) and case (3b) - All samples covered or not covered with full SampleMask
static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);

static const simdscalari vZero = _simd_setzero_si();
@ -88,7 +88,7 @@ INLINE void ProcessAttributes(
inputSlot = backendState.vertexAttribOffset + i;
}

__m128 attrib[3]; // triangle attribs (always 4 wide)
simd4scalar attrib[3]; // triangle attribs (always 4 wide)
float* pAttribStart = pBuffer;

if (HasConstantInterpT::value || IsDegenerate::value)
@ -128,7 +128,7 @@ INLINE void ProcessAttributes(

for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[vid]);
SIMD128::store_ps(pBuffer, attrib[vid]);
pBuffer += 4;
}
}
@ -138,7 +138,7 @@ INLINE void ProcessAttributes(

for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[i]);
SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@ -149,7 +149,7 @@ INLINE void ProcessAttributes(

for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
_mm_store_ps(pBuffer, attrib[i]);
SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
@ -160,7 +160,7 @@ INLINE void ProcessAttributes(
// effect of the missing vertices in the triangle interpolation.
for (uint32_t v = NumVertsT::value; v < 3; ++v)
{
_mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
pBuffer += 4;
}
@ -279,8 +279,7 @@ struct GatherScissors_simd16<16>
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simd16scalari &scisXmin, simd16scalari &scisYmin,
simd16scalari &scisXmax, simd16scalari &scisYmax)
{
simd16scalari &scisXmax, simd16scalari &scisYmax) {
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask,
uint32_t clipAttribSlot = clipSlot == 0 ?
VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;

__m128 primClipDist[3];
simd4scalar primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);

float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
{
OSALIGNSIMD(float) aVertClipDist[4];
_mm_store_ps(aVertClipDist, primClipDist[e]);
SIMD128::store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
};
@ -625,13 +624,14 @@ void BinTriangles(
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}

simdBBox bbox;

if (!triMask)
{
goto endBinTriangles;
}

// Calc bounding box of triangles
simdBBox bbox;
calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

// determine if triangle falls between pixel centers and discard
@ -673,28 +673,30 @@ void BinTriangles(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.backendState.readViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

// Make triangle bbox inclusive
bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
    // Make triangle bbox inclusive
    bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
    bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));

bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
    bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
}

if (CT::IsConservativeT::value)
{
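Spelled out, the bbox arithmetic in that hunk: xmax/ymax are exclusive edges in x.8 fixed point, so 1 ULP is subtracted to make them inclusive before clamping to the scissor with max/min. A scalar sketch of one lane of the same clamp:

#include <algorithm>
#include <cstdio>

struct BBox { int xmin, ymin, xmax, ymax; };

// Clamp a bbox with exclusive max edges (x.8 fixed point) to a scissor rect.
BBox clampToScissor(BBox bbox, const BBox& scissor)
{
    bbox.xmax -= 1; // make exclusive xmax/ymax edges inclusive (subtract 1 ULP)
    bbox.ymax -= 1;
    bbox.xmin = std::max(bbox.xmin, scissor.xmin);
    bbox.ymin = std::max(bbox.ymin, scissor.ymin);
    bbox.xmax = std::min(bbox.xmax, scissor.xmax);
    bbox.ymax = std::min(bbox.ymax, scissor.ymax);
    return bbox;
}

int main()
{
    BBox tri{ -10, 5, 300, 400 };
    BBox scissor{ 0, 0, 256, 256 };
    BBox r = clampToScissor(tri, scissor);
    printf("%d %d %d %d\n", r.xmin, r.ymin, r.xmax, r.ymax); // 0 5 256 256
    // xmin > xmax after clamping means the primitive is entirely scissored out.
    return 0;
}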
@ -768,7 +770,7 @@ endBinTriangles:

// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@ -837,10 +839,10 @@ endBinTriangles:
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

_mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

// store user clip distances
if (rastState.clipDistanceMask)
@ -870,7 +872,7 @@ endBinTriangles:

#if USE_SIMD16_FRONTEND
template <typename CT>
void SIMDAPI BinTriangles_simd16(
void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;

if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;

// Make triangle bbox inclusive
bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
    if (state.backendState.readViewportArrayIndex)
    {
        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
    // Make triangle bbox inclusive
    bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
    bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));

    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
    bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
}

if (CT::IsConservativeT::value)
{
@ -1221,10 +1225,10 @@ endBinTriangles:

// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH

vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@ -1547,24 +1551,26 @@ void BinPostSetupPoints(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.backendState.readViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
}

// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16(
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.backendState.readViewportArrayIndex)
    {
        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
}

// Cull bloated points completely outside scissor
simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16(
AR_END(FEBinPoints, 1);
}

void SIMDAPI BinPoints_simd16(
void SIMDCALL BinPoints_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -2168,6 +2176,8 @@ void BinPostSetupLines(
simdscalar& vRecipW0 = recipW[0];
simdscalar& vRecipW1 = recipW[1];

simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];

// convert to fixed point
simdscalari vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical(prim[0].x);
@ -2214,24 +2224,26 @@ void BinPostSetupLines(
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);

// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.backendState.readViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
}

// Cull prims completely outside scissor
{
@ -2261,7 +2273,6 @@ void BinPostSetupLines(

// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@ -2310,10 +2321,10 @@ void BinPostSetupLines(

// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
_mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

// store user clip distances
if (rastState.clipDistanceMask)
@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16(
bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);

// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;

if (state.backendState.readViewportArrayIndex)
{
GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;

bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
    if (state.backendState.readViewportArrayIndex)
    {
        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
}

// Cull prims completely outside scissor
{
@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16(

// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
__m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH

vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
@ -2650,7 +2663,7 @@ void BinLines(
}

#if USE_SIMD16_FRONTEND
void SIMDAPI BinLines_simd16(
void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
@ -188,7 +188,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p
}

#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipTriangles, pDC->drawId);
@ -203,7 +203,7 @@ void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t work
AR_END(FEClipTriangles, 1);
}

void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipLines, pDC->drawId);
@ -218,7 +218,7 @@ void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId
AR_END(FEClipLines, 1);
}

void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipPoints, pDC->drawId);
@ -1095,7 +1095,7 @@ public:
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
// we have to clip tris, execute the clipper, which will also
// call the binner
ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId);
ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
AR_END(FEGuardbandClip, 1);
}
else if (validMask)
@ -1180,7 +1180,7 @@ private:
{
simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simd16scalar vSrc = _simd16_setzero_ps();
return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
}

#endif
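For reference, the semantics behind the masked gather being wrapped here (the SIMDLib version now takes the float mask directly, hence the dropped cast): each lane either loads from pBuffer at its byte offset or keeps the source lane, keyed off the mask lane's sign bit. A scalar model of one such gather (illustrative, not the SIMDLib signature):

#include <cstdint>
#include <cstring>

// Scalar model of a masked 32-bit gather: for each lane, load from
// base + offset (byte-scaled) if the mask lane's sign bit is set,
// otherwise keep the corresponding lane of src.
void maskGatherPs(float* dst, const float* src, const uint8_t* base,
                  const int32_t* offsets, const int32_t* mask,
                  int lanes, int scale)
{
    for (int i = 0; i < lanes; ++i)
    {
        if (mask[i] < 0) // sign bit set means the lane is active
            std::memcpy(&dst[i], base + offsets[i] * scale, sizeof(float));
        else
            dst[i] = src[i];
    }
}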
@ -1895,8 +1895,8 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
#if USE_SIMD16_FRONTEND
void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
#endif
@ -218,7 +218,7 @@ typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t worke

#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
uint32_t primMask, simd16scalari primID);

#endif
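SIMDCALL replaces SIMDAPI on every function that passes the new SIMD types by value; its definition lives in os.h, outside this diff. Presumably it selects a vector calling convention where one exists, along the lines of this sketch (the exact macro body here is an assumption):

// Hypothetical sketch of a SIMDCALL-style macro; the real definition is in os.h.
#if defined(_MSC_VER)
#define SIMDCALL __vectorcall   // pass SIMD arguments in registers on MSVC
#else
#define SIMDCALL                // default convention elsewhere
#endif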
@ -202,7 +202,7 @@ INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@ -247,7 +247,7 @@ INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
@ -293,7 +293,7 @@ INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
@ -309,7 +309,7 @@ INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT DstFormat>
INLINE void SIMDAPI StoreSOA(const simd16vector &src, uint8_t *pDst)
INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
@ -43,7 +43,7 @@ struct PackTraits
static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static simd16scalar unpack(simd16scalar &in) = delete;
static simd16scalar pack(simd16scalar &in) = delete;
#endif
@ -63,7 +63,7 @@ struct PackTraits<0, false>
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
#endif
@ -109,7 +109,7 @@ struct PackTraits<8, false>

__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
return _mm256_castsi256_ps(result);
return simdscalar{ _mm256_castsi256_ps(result) };
#else
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
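The added braces in this hunk are the visible cost of simdscalar changing from a plain __m256 typedef to a SIMDLib union type: the raw intrinsic result is no longer the same type as simdscalar, so it is wrapped explicitly at the return. A minimal sketch of the shape of such a wrapper (illustrative, not the actual SIMDLib type):

#include <immintrin.h>

// Illustrative wrapper in the spirit of SIMDLib's 256-bit Float type.
union MyFloat
{
    MyFloat() = default;
    MyFloat(__m256 in) : v(in) {}
    __m256 v;
    __m128 v4[2];
};

MyFloat makeOnes()
{
    // Brace-init wraps the raw __m256, as in the diff:
    return MyFloat{ _mm256_set1_ps(1.0f) };
}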
@ -144,7 +144,7 @@ struct PackTraits<8, false>
return result;
}

static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@ -152,7 +152,8 @@ struct PackTraits<8, false>

static simd16scalar unpack(simd16scalar &in)
{
simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
simd16scalari result = _simd16_cvtepu8_epi32(tmp);

return _simd16_castsi_ps(result);
}
@ -259,7 +260,7 @@ struct PackTraits<8, true>
return result;
}

static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@ -267,7 +268,8 @@ struct PackTraits<8, true>

static simd16scalar unpack(simd16scalar &in)
{
simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
simd16scalari result = _simd16_cvtepu8_epi32(tmp);

return _simd16_castsi_ps(result);
}
@ -370,7 +372,7 @@ struct PackTraits<16, false>
return result;
}

static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
@ -469,7 +471,7 @@ struct PackTraits<16, true>
return result;
}

static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
@ -514,7 +516,7 @@ struct PackTraits<32, false>
return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
}

static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd16_store_ps(reinterpret_cast<float *>(pDst), src);
}
@ -812,7 +814,7 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src)

#if ENABLE_AVX512_SIMD16
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
{
static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
* powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
@ -834,7 +836,7 @@ inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
return result;
}

inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
{
// 5/12 is too small, so compute the 4th root of 20/12 instead.
// 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
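The arithmetic behind those two comments, spelled out: (x^(20/12))^(1/4) = x^(5/12) and 20/12 = 5/3, so the function only needs x^(5/3) plus two square roots. The two decompositions 5/3 = 1 + 2/3 and 5/3 = 2 - 1/3 give x^(5/3) = x * x^(2/3) = x^2 / x^(1/3), both of which hand fastpow an exponent of magnitude below one, where its approximation is usable; the xavg returned in the next hunk appears to average the two estimates before the square roots are taken.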
@ -855,7 +857,7 @@ inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
return xavg;
}

inline static simd16scalar SIMDAPI powf_wrapper(const simd16scalar base, float exp)
inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
{
const float *f = reinterpret_cast<const float *>(&base);
@ -1410,7 +1412,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
}

INLINE static void SIMDAPI storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
{
switch (comp)
{
@ -31,58 +31,58 @@
#include "common/simdintrin.h"

INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
{
__m128i row0i = _mm_castps_si128(row0);
__m128i row1i = _mm_castps_si128(row1);
__m128i row2i = _mm_castps_si128(row2);
__m128i row3i = _mm_castps_si128(row3);
simd4scalari row0i = SIMD128::castps_si(row0);
simd4scalari row1i = SIMD128::castps_si(row1);
simd4scalari row2i = SIMD128::castps_si(row2);
simd4scalari row3i = SIMD128::castps_si(row3);

__m128i vTemp = row2i;
row2i = _mm_unpacklo_epi32(row2i, row3i);
vTemp = _mm_unpackhi_epi32(vTemp, row3i);
simd4scalari vTemp = row2i;
row2i = SIMD128::unpacklo_epi32(row2i, row3i);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);

row3i = row0i;
row0i = _mm_unpacklo_epi32(row0i, row1i);
row3i = _mm_unpackhi_epi32(row3i, row1i);
row0i = SIMD128::unpacklo_epi32(row0i, row1i);
row3i = SIMD128::unpackhi_epi32(row3i, row1i);

row1i = row0i;
row0i = _mm_unpacklo_epi64(row0i, row2i);
row1i = _mm_unpackhi_epi64(row1i, row2i);
row0i = SIMD128::unpacklo_epi64(row0i, row2i);
row1i = SIMD128::unpackhi_epi64(row1i, row2i);

row2i = row3i;
row2i = _mm_unpacklo_epi64(row2i, vTemp);
row3i = _mm_unpackhi_epi64(row3i, vTemp);
row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
row3i = SIMD128::unpackhi_epi64(row3i, vTemp);

row0 = _mm_castsi128_ps(row0i);
row1 = _mm_castsi128_ps(row1i);
row2 = _mm_castsi128_ps(row2i);
row3 = _mm_castsi128_ps(row3i);
row0 = SIMD128::castsi_ps(row0i);
row1 = SIMD128::castsi_ps(row1i);
row2 = SIMD128::castsi_ps(row2i);
row3 = SIMD128::castsi_ps(row3i);
}

INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
{
__m128i vTemp = row2;
row2 = _mm_unpacklo_epi32(row2, row3);
vTemp = _mm_unpackhi_epi32(vTemp, row3);
simd4scalari vTemp = row2;
row2 = SIMD128::unpacklo_epi32(row2, row3);
vTemp = SIMD128::unpackhi_epi32(vTemp, row3);

row3 = row0;
row0 = _mm_unpacklo_epi32(row0, row1);
row3 = _mm_unpackhi_epi32(row3, row1);
row0 = SIMD128::unpacklo_epi32(row0, row1);
row3 = SIMD128::unpackhi_epi32(row3, row1);

row1 = row0;
row0 = _mm_unpacklo_epi64(row0, row2);
row1 = _mm_unpackhi_epi64(row1, row2);
row0 = SIMD128::unpacklo_epi64(row0, row2);
row1 = SIMD128::unpackhi_epi64(row1, row2);

row2 = row3;
row2 = _mm_unpacklo_epi64(row2, vTemp);
row3 = _mm_unpackhi_epi64(row3, vTemp);
row2 = SIMD128::unpacklo_epi64(row2, vTemp);
row3 = SIMD128::unpackhi_epi64(row3, vTemp);
}

#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
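A quick usage check for the 4x4 transpose above; the standard _MM_TRANSPOSE4_PS macro performs the same unpack/shuffle dance, so it serves as a reference (illustrative driver, not SWR code):

#include <xmmintrin.h>
#include <cstdio>

// Minimal checkable driver for a 4x4 float transpose done with the
// same unpack idiom as vTranspose above.
static void transpose4x4(float m[4][4])
{
    __m128 r0 = _mm_loadu_ps(m[0]);
    __m128 r1 = _mm_loadu_ps(m[1]);
    __m128 r2 = _mm_loadu_ps(m[2]);
    __m128 r3 = _mm_loadu_ps(m[3]);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m[0], r0);
    _mm_storeu_ps(m[1], r1);
    _mm_storeu_ps(m[2], r2);
    _mm_storeu_ps(m[3], r3);
}

int main()
{
    float m[4][4] = { {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15} };
    transpose4x4(m);
    printf("%g %g %g %g\n", m[0][0], m[0][1], m[0][2], m[0][3]); // 0 4 8 12
    return 0;
}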
@ -94,10 +94,10 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7

vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);

vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@ -106,7 +106,7 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
}

INLINE
void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
@ -118,10 +118,10 @@ void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7

vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);

vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@ -227,16 +227,16 @@ struct Transpose8_8_8_8

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
__m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
__m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
__m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
__m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
__m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
__m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
__m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
__m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
_mm_store_si128((__m128i*)pDst, c0123lo);
_mm_store_si128((__m128i*)(pDst + 16), c0123hi);
simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
SIMD128::store_si((simd4scalari*)pDst, c0123lo);
SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
#else
simdscalari dst01 = _simd_shuffle_epi8(src,
_simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
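In scalar terms, Transpose8_8_8_8's AVX path converts 8 pixels from SOA (8 reds, 8 greens, 8 blues, 8 alphas) to AOS (one rgba quad per pixel); the unpack cascade above is a fast version of this reference loop:

#include <cstdint>

// Scalar reference for an 8-pixel SOA (rrrrrrrr gggggggg bbbbbbbb aaaaaaaa)
// to AOS (rgba rgba ...) transpose, matching the unpack cascade above.
void transpose8_8_8_8_scalar(const uint8_t* pSrc, uint8_t* pDst)
{
    for (int i = 0; i < 8; ++i)
    {
        pDst[4 * i + 0] = pSrc[i];      // r
        pDst[4 * i + 1] = pSrc[8 + i];  // g
        pDst[4 * i + 2] = pSrc[16 + i]; // b
        pDst[4 * i + 3] = pSrc[24 + i]; // a
    }
}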
@ -254,10 +254,10 @@ struct Transpose8_8_8_8

INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
__m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
__m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
__m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
@ -305,10 +305,10 @@ struct Transpose8_8
#if KNOB_SIMD_WIDTH == 8
simdscalari src = _simd_load_si((const simdscalari*)pSrc);

__m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
__m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = _mm_unpacklo_epi8(rg, g);
_mm_store_si128((__m128i*)pDst, rg);
simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
rg = SIMD128::unpacklo_epi8(rg, g);
SIMD128::store_si((simd4scalari*)pDst, rg);
#else
#error Unsupported vector width
#endif
@ -317,8 +317,8 @@ struct Transpose8_8

INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
__m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg

simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
@ -349,16 +349,16 @@ struct Transpose32_32_32_32
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

__m128 vDst[8];
simd4scalar vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst+4, vDst[1]);
_mm_store_ps((float*)pDst+8, vDst[2]);
_mm_store_ps((float*)pDst+12, vDst[3]);
_mm_store_ps((float*)pDst+16, vDst[4]);
_mm_store_ps((float*)pDst+20, vDst[5]);
_mm_store_ps((float*)pDst+24, vDst[6]);
_mm_store_ps((float*)pDst+28, vDst[7]);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst+4, vDst[1]);
SIMD128::store_ps((float*)pDst+8, vDst[2]);
SIMD128::store_ps((float*)pDst+12, vDst[3]);
SIMD128::store_ps((float*)pDst+16, vDst[4]);
SIMD128::store_ps((float*)pDst+20, vDst[5]);
SIMD128::store_ps((float*)pDst+24, vDst[6]);
SIMD128::store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
@ -400,16 +400,16 @@ struct Transpose32_32_32
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

__m128 vDst[8];
simd4scalar vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
_mm_store_ps((float*)pDst, vDst[0]);
_mm_store_ps((float*)pDst + 4, vDst[1]);
_mm_store_ps((float*)pDst + 8, vDst[2]);
_mm_store_ps((float*)pDst + 12, vDst[3]);
_mm_store_ps((float*)pDst + 16, vDst[4]);
_mm_store_ps((float*)pDst + 20, vDst[5]);
_mm_store_ps((float*)pDst + 24, vDst[6]);
_mm_store_ps((float*)pDst + 28, vDst[7]);
SIMD128::store_ps((float*)pDst, vDst[0]);
SIMD128::store_ps((float*)pDst + 4, vDst[1]);
SIMD128::store_ps((float*)pDst + 8, vDst[2]);
SIMD128::store_ps((float*)pDst + 12, vDst[3]);
SIMD128::store_ps((float*)pDst + 16, vDst[4]);
SIMD128::store_ps((float*)pDst + 20, vDst[5]);
SIMD128::store_ps((float*)pDst + 24, vDst[6]);
SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
@@ -448,21 +448,21 @@ struct Transpose32_32
    {
#if KNOB_SIMD_WIDTH == 8
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
        simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
        simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
        simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
        simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);

        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
        simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
        simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
        simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
        simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
        SIMD128::store_ps(pfDst + 0, dst0);
        SIMD128::store_ps(pfDst + 4, dst1);
        SIMD128::store_ps(pfDst + 8, dst2);
        SIMD128::store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
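The Transpose32_32 path above is a plain two-channel SoA-to-AoS interleave; the unpacklo/unpackhi pairs produce exactly the following scalar behavior. An illustrative reference, not part of the driver:

    // Scalar reference for the 8-wide case: 8 floats of R followed by
    // 8 floats of G become 8 interleaved (r, g) pairs.
    void Transpose32_32_Reference(const float* pSrc, float* pDst)
    {
        for (int i = 0; i < 8; ++i)
        {
            pDst[2 * i + 0] = pSrc[i];      // r channel, SoA plane 0
            pDst[2 * i + 1] = pSrc[8 + i];  // g channel, SoA plane 1
        }
    }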
@@ -504,25 +504,25 @@ struct Transpose16_16_16_16
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
        simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
        simd4scalari src_a = _simd_extractf128_si(src_ba, 1);

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
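Transpose16_16_16_16 does the same interleave at 16 bits and four channels: the epi16 unpacks build (r, g) and (b, a) pairs, and the epi32 unpacks zip those pairs into whole pixels. An illustrative scalar reference, not driver code:

    #include <cstdint>

    // 8 R, 8 G, 8 B, 8 A values in SoA order become 8 packed RGBA pixels.
    void Transpose16_16_16_16_Reference(const uint16_t* pSrc, uint16_t* pDst)
    {
        for (int i = 0; i < 8; ++i)
            for (int c = 0; c < 4; ++c)            // c indexes the R, G, B, A planes
                pDst[4 * i + c] = pSrc[8 * c + i];
    }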
@@ -573,25 +573,25 @@ struct Transpose16_16_16
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128();
        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
        simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
        simd4scalari src_a = SIMD128::setzero_si();

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
@@ -642,17 +642,17 @@ struct Transpose16_16
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);
        simd4scalar comp0 = _simd_extractf128_ps(src, 0);
        simd4scalar comp1 = _simd_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);
        simd4scalari comp0i = SIMD128::castps_si(comp0);
        simd4scalari comp1i = SIMD128::castps_si(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
        simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
        simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
        SIMD128::store_si((simd4scalari*)pDst, resLo);
        SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
@@ -527,7 +527,7 @@ static void StreamOut(
    // Write all entries into primitive data buffer for SOS.
    while (_BitScanForward(&slot, soMask))
    {
        __m128 attrib[MAX_NUM_VERTS_PER_PRIM];      // prim attribs (always 4 wide)
        simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
        uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
        pa.AssembleSingle(paSlot, primIndex, attrib);
@@ -941,7 +941,9 @@ static void GeometryShaderStage(

        if (HasStreamOutT::value)
        {
#if ENABLE_AVX512_SIMD16
            gsPa.useAlternateOffset = false;
#endif
            StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
        }
@@ -1279,7 +1281,9 @@ static void TessellationStages(
    {
        if (HasStreamOutT::value)
        {
#if ENABLE_AVX512_SIMD16
            tessPa.useAlternateOffset = false;
#endif
            StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
        }
@@ -391,7 +391,7 @@ struct PA_STATE_BASE;  // forward decl
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
#if USE_SIMD16_FRONTEND
void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
#endif
@@ -92,7 +92,7 @@ struct PA_STATE
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
@@ -139,7 +139,7 @@ struct PA_STATE_OPT : public PA_STATE
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
@@ -205,7 +205,7 @@ struct PA_STATE_OPT : public PA_STATE

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }
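The signature change here is mechanical, since simd4scalar is the SIMDLib name for the 128-bit float register on these targets; the dispatch shape is unchanged. A hedged sketch of the pattern, with simplified illustrative names rather than the real PA_STATE_OPT:

    #include <cstdint>

    // One function pointer per topology; AssembleSingle just forwards to it.
    struct PA_sketch
    {
        typedef void (*PFN_SINGLE)(PA_sketch& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
        PFN_SINGLE pfnPaSingleFunc = nullptr;

        void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
        {
            pfnPaSingleFunc(*this, slot, primIndex, verts);
        }
    };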
@@ -767,7 +767,7 @@ PRAGMA_WARNING_POP()
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
@@ -1253,7 +1253,7 @@ struct PA_TESS : PA_STATE
                _simd16_setzero_ps(),
                pBase,
                indices,
                mask,
                _simd16_castsi_ps(mask),
                4 /* gcc doesn't like sizeof(float) */);

            verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
@@ -1263,7 +1263,7 @@ struct PA_TESS : PA_STATE
                pBase,
                indices,
                _simd_castsi_ps(mask),
                4 /* gcc doesn't like sizeof(float) */);
                4); // gcc doesn't like sizeof(float)
#endif
            pBase += m_attributeStrideInVectors * SIMD_WIDTH;
        }
@@ -1302,7 +1302,7 @@ struct PA_TESS : PA_STATE
                _simd16_setzero_ps(),
                pBase,
                indices,
                mask,
                _simd16_castsi_ps(mask),
                4 /* gcc doesn't like sizeof(float) */);
#else
            simdscalar temp = _simd_mask_i32gather_ps(
@@ -1321,7 +1321,7 @@ struct PA_TESS : PA_STATE
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());
@@ -34,103 +34,103 @@

#if (KNOB_SIMD_WIDTH == 8)

INLINE __m128 swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}

INLINE __m128 swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}

INLINE __m128 swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}

INLINE __m128 swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}

INLINE __m128 swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}

INLINE __m128 swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
    simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}

INLINE __m128 swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}

INLINE __m128 swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
    simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
    simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}

INLINE __m128 swizzleLane0(const simdvector &v)
INLINE simd4scalar swizzleLane0(const simdvector &v)
{
    return swizzleLane0(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane1(const simdvector &v)
INLINE simd4scalar swizzleLane1(const simdvector &v)
{
    return swizzleLane1(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane2(const simdvector &v)
INLINE simd4scalar swizzleLane2(const simdvector &v)
{
    return swizzleLane2(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane3(const simdvector &v)
INLINE simd4scalar swizzleLane3(const simdvector &v)
{
    return swizzleLane3(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane4(const simdvector &v)
INLINE simd4scalar swizzleLane4(const simdvector &v)
{
    return swizzleLane4(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane5(const simdvector &v)
INLINE simd4scalar swizzleLane5(const simdvector &v)
{
    return swizzleLane5(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane6(const simdvector &v)
INLINE simd4scalar swizzleLane6(const simdvector &v)
{
    return swizzleLane6(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLane7(const simdvector &v)
INLINE simd4scalar swizzleLane7(const simdvector &v)
{
    return swizzleLane7(v.x, v.y, v.z, v.w);
}

INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
{
    switch (lane)
    {
@@ -156,87 +156,87 @@ INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
}

#if ENABLE_AVX512_SIMD16
INLINE __m128 swizzleLane0(const simd16vector &v)
INLINE simd4scalar swizzleLane0(const simd16vector &v)
{
    return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane1(const simd16vector &v)
INLINE simd4scalar swizzleLane1(const simd16vector &v)
{
    return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane2(const simd16vector &v)
INLINE simd4scalar swizzleLane2(const simd16vector &v)
{
    return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane3(const simd16vector &v)
INLINE simd4scalar swizzleLane3(const simd16vector &v)
{
    return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane4(const simd16vector &v)
INLINE simd4scalar swizzleLane4(const simd16vector &v)
{
    return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane5(const simd16vector &v)
INLINE simd4scalar swizzleLane5(const simd16vector &v)
{
    return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane6(const simd16vector &v)
INLINE simd4scalar swizzleLane6(const simd16vector &v)
{
    return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane7(const simd16vector &v)
INLINE simd4scalar swizzleLane7(const simd16vector &v)
{
    return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}

INLINE __m128 swizzleLane8(const simd16vector &v)
INLINE simd4scalar swizzleLane8(const simd16vector &v)
{
    return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLane9(const simd16vector &v)
INLINE simd4scalar swizzleLane9(const simd16vector &v)
{
    return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneA(const simd16vector &v)
INLINE simd4scalar swizzleLaneA(const simd16vector &v)
{
    return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneB(const simd16vector &v)
INLINE simd4scalar swizzleLaneB(const simd16vector &v)
{
    return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneC(const simd16vector &v)
INLINE simd4scalar swizzleLaneC(const simd16vector &v)
{
    return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneD(const simd16vector &v)
INLINE simd4scalar swizzleLaneD(const simd16vector &v)
{
    return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneE(const simd16vector &v)
INLINE simd4scalar swizzleLaneE(const simd16vector &v)
{
    return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneF(const simd16vector &v)
INLINE simd4scalar swizzleLaneF(const simd16vector &v)
{
    return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}

INLINE __m128 swizzleLaneN(const simd16vector &v, int lane)
INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
{
    switch (lane)
    {
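All of the swizzleLane helpers above reduce to the same gather: pick lane N from each of the four component vectors and pack the results as one xyzw register. An illustrative scalar reference, not driver code:

    void swizzleLaneN_Reference(const float x[], const float y[],
                                const float z[], const float w[],
                                int lane, float out[4])
    {
        out[0] = x[lane];  // position/attribute component x of vertex 'lane'
        out[1] = y[lane];
        out[2] = z[lane];
        out[3] = w[lane];
    }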
@@ -286,7 +286,7 @@ bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -294,7 +294,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -302,7 +302,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -310,7 +310,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -318,7 +318,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -326,7 +326,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -334,13 +334,13 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);

@@ -350,10 +350,10 @@ bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

template <uint32_t TotalControlPoints>
void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
    // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
    // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute.
@@ -788,7 +788,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);

@@ -1057,7 +1057,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);

@@ -1325,7 +1325,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);

@@ -1491,7 +1491,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);

@@ -1741,7 +1741,7 @@ bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
    PaLineStripSingle0(pa, slot, primIndex, verts);

@@ -1855,7 +1855,7 @@ bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);

@@ -2075,7 +2075,7 @@ bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);

@@ -2239,7 +2239,7 @@ bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
}

#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);

@@ -2529,7 +2529,7 @@ void PaRectListSingle0(
    PA_STATE_OPT& pa,
    uint32_t slot,
    uint32_t primIndex,
    __m128 verts[])
    simd4scalar verts[])
{
    // We have 12 simdscalars contained within 3 simdvectors which
    // hold at least 8 triangles worth of data. We want to assemble a single
@@ -199,15 +199,15 @@ struct StorePixels<32, 2>
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    {
        // Each 4-pixel row is 16-bytes
        __m128i *pZRow01 = (__m128i*)pSrc;
        __m128i vQuad00 = _mm_load_si128(pZRow01);
        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
        simd4scalari *pZRow01 = (simd4scalari*)pSrc;
        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);

        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);

        _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
        _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
    }
};
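StorePixels<32, 2> is undoing the hot tile's quad swizzle: two 2x2 quads are stored quad-major, and the 64-bit unpacks rebuild the two linear scanlines. An illustrative scalar reference, assuming the quad-internal pixel order implied by the "0 1 4 5" comments in the next hunk:

    #include <cstdint>

    // 8 pixels in SWR-Z (quad) order are rewritten as two 4-pixel rows.
    void Unswizzle32_2_Reference(const uint32_t src[8], uint32_t row0[4], uint32_t row1[4])
    {
        // src = { A0 A1 A2 A3, B0 B1 B2 B3 }; A0 A1 / B0 B1 are the top
        // rows of quads A and B, and A2 A3 / B2 B3 the bottom rows.
        row0[0] = src[0]; row0[1] = src[1]; row0[2] = src[4]; row0[3] = src[5];
        row1[0] = src[2]; row1[1] = src[3]; row1[2] = src[6]; row1[3] = src[7];
    }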
@@ -218,20 +218,20 @@ struct StorePixels<32, 4>
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 4 x 16 bytes = 64 bytes, 16 pixels
        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);

        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);

        // Unswizzle from SWR-Z order
        __m128i quad0 = _mm_load_si128(&pSrc128[0]);   // 0 1 2 3
        __m128i quad1 = _mm_load_si128(&pSrc128[1]);   // 4 5 6 7
        __m128i quad2 = _mm_load_si128(&pSrc128[2]);   // 8 9 A B
        __m128i quad3 = _mm_load_si128(&pSrc128[3]);   // C D E F
        simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);   // 0 1 2 3
        simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);   // 4 5 6 7
        simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);   // 8 9 A B
        simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);   // C D E F

        _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1));   // 0 1 4 5
        _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1));   // 2 3 6 7
        _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3));   // 8 9 C D
        _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3));   // A B E F
        SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
        SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
        SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
        SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
    }
};
@@ -251,10 +251,10 @@ struct StorePixels<64, 4>
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // Each 4-pixel row is 32 bytes.
        const __m128i* pPixSrc = (const __m128i*)pSrc;
        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;

        // order of pointers match SWR-Z layout
        __m128i** pvDsts = (__m128i**)&ppDsts[0];
        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
        *pvDsts[0] = pPixSrc[0];
        *pvDsts[1] = pPixSrc[1];
        *pvDsts[2] = pPixSrc[2];
@@ -269,9 +269,9 @@ struct StorePixels<64, 8>
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // 8 x 16 bytes = 128 bytes, 16 pixels
        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);

        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);

        // order of pointers match SWR-Z layout
        *ppDsts128[0] = pSrc128[0];   // 0 1
@@ -301,10 +301,10 @@ struct StorePixels<128, 8>
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    {
        // Each 4-pixel row is 64 bytes.
        const __m128i* pPixSrc = (const __m128i*)pSrc;
        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;

        // Unswizzle from SWR-Z order
        __m128i** pvDsts = (__m128i**)&ppDsts[0];
        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
        *pvDsts[0] = pPixSrc[0];
        *pvDsts[1] = pPixSrc[2];
        *pvDsts[2] = pPixSrc[1];
@@ -323,9 +323,9 @@ struct StorePixels<128, 16>
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    {
        // 16 x 16 bytes = 256 bytes, 16 pixels
        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);

        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);

        for (uint32_t i = 0; i < 16; i += 4)
        {
@@ -563,8 +563,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
        temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

        // merge/store data into destination but don't overwrite the X8 bits
        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));

        simd16scalari dest = _simd16_setzero_si();

@@ -575,8 +575,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>

        dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));

        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
#else
        static const uint32_t MAX_RASTER_TILE_BYTES = 128;   // 8 pixels * 16 bytes per pixel
@@ -593,25 +593,25 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>

        // Store data into destination but don't overwrite the X8 bits
        // Each 4-pixel row is 16-bytes
        __m128i *pZRow01 = (__m128i*)aosTile;
        __m128i vQuad00 = _mm_load_si128(pZRow01);
        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
        simd4scalari *pZRow01 = (simd4scalari*)aosTile;
        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);

        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);

        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
        simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
        simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);

        __m128i vMask = _mm_set1_epi32(0xFFFFFF);
        simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);

        vDst0 = _mm_andnot_si128(vMask, vDst0);
        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
        vDst1 = _mm_andnot_si128(vMask, vDst1);
        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
        vDst0 = SIMD128::andnot_si(vMask, vDst0);
        vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
        vDst1 = SIMD128::andnot_si(vMask, vDst1);
        vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));

        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
#endif
    }
};
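Both R24_UNORM_X8 paths above are a classic masked read-modify-write: only the low 24 depth bits are replaced, and the destination's X8 byte survives. The andnot/or pair computes, per dword, the scalar equivalent below (for illustration only):

    #include <cstdint>

    uint32_t MergeR24X8_Reference(uint32_t dst, uint32_t src)
    {
        const uint32_t mask = 0x00FFFFFF;      // 24 depth bits
        return (dst & ~mask) | (src & mask);   // keep dst's top byte, take src's depth
    }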
@@ -683,8 +683,8 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
        // store 8x2 memory order:
        //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
        //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
        _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
        _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
    }

#endif
@@ -736,15 +736,15 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst

        // splitting into two sets of 4 wide integer vector types
        // because AVX doesn't have instructions to support this operation at 8 wide
        __m128i srcLo0 = _mm256_castsi256_si128(src0);   // 000r000r000r000r
        __m128i srcLo1 = _mm256_castsi256_si128(src1);   // 000g000g000g000g
        __m128i srcLo2 = _mm256_castsi256_si128(src2);   // 000b000b000b000b
        __m128i srcLo3 = _mm256_castsi256_si128(src3);   // 000a000a000a000a
        simd4scalari srcLo0 = _mm256_castsi256_si128(src0);   // 000r000r000r000r
        simd4scalari srcLo1 = _mm256_castsi256_si128(src1);   // 000g000g000g000g
        simd4scalari srcLo2 = _mm256_castsi256_si128(src2);   // 000b000b000b000b
        simd4scalari srcLo3 = _mm256_castsi256_si128(src3);   // 000a000a000a000a

        __m128i srcHi0 = _mm256_extractf128_si256(src0, 1);   // 000r000r000r000r
        __m128i srcHi1 = _mm256_extractf128_si256(src1, 1);   // 000g000g000g000g
        __m128i srcHi2 = _mm256_extractf128_si256(src2, 1);   // 000b000b000b000b
        __m128i srcHi3 = _mm256_extractf128_si256(src3, 1);   // 000a000a000a000a
        simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1);   // 000r000r000r000r
        simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1);   // 000g000g000g000g
        simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1);   // 000b000b000b000b
        simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1);   // 000a000a000a000a

        srcLo1 = _mm_slli_si128(srcLo1, 1);   // 00g000g000g000g0
        srcHi1 = _mm_slli_si128(srcHi1, 1);   // 00g000g000g000g0

@@ -753,18 +753,18 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
        srcLo3 = _mm_slli_si128(srcLo3, 3);   // a000a000a000a000
        srcHi3 = _mm_slli_si128(srcHi3, 3);   // a000a000a000a000

        srcLo0 = _mm_or_si128(srcLo0, srcLo1);   // 00gr00gr00gr00gr
        srcLo2 = _mm_or_si128(srcLo2, srcLo3);   // ab00ab00ab00ab00
        srcLo0 = SIMD128::or_si(srcLo0, srcLo1);   // 00gr00gr00gr00gr
        srcLo2 = SIMD128::or_si(srcLo2, srcLo3);   // ab00ab00ab00ab00

        srcHi0 = _mm_or_si128(srcHi0, srcHi1);   // 00gr00gr00gr00gr
        srcHi2 = _mm_or_si128(srcHi2, srcHi3);   // ab00ab00ab00ab00
        srcHi0 = SIMD128::or_si(srcHi0, srcHi1);   // 00gr00gr00gr00gr
        srcHi2 = SIMD128::or_si(srcHi2, srcHi3);   // ab00ab00ab00ab00

        srcLo0 = _mm_or_si128(srcLo0, srcLo2);   // abgrabgrabgrabgr
        srcHi0 = _mm_or_si128(srcHi0, srcHi2);   // abgrabgrabgrabgr
        srcLo0 = SIMD128::or_si(srcLo0, srcLo2);   // abgrabgrabgrabgr
        srcHi0 = SIMD128::or_si(srcHi0, srcHi2);   // abgrabgrabgrabgr

        // unpack into rows that get the tiling order correct
        __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);   // abgrabgrabgrabgrabgrabgrabgrabgr
        __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
        simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);   // abgrabgrabgrabgrabgrabgrabgrabgr
        simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);

        simdscalari final = _mm256_castsi128_si256(vRow00);
        final = _mm256_insertf128_si256(final, vRow10, 1);
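The shift-and-OR sequence above packs four byte-per-dword channels into 32-bit pixels; the split into 128-bit halves is only needed because AVX1 lacks 256-bit integer byte shifts. Per pixel it computes the scalar equivalent below (illustrative only; each input dword is assumed to hold one channel byte, 0x000000XX):

    #include <cstdint>

    // The shifts mirror _mm_slli_si128 by 1, 2 and 3 bytes in the vector code,
    // yielding the "abgr" byte layout noted in the comments.
    uint32_t PackABGR_Reference(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
    {
        return r | (g << 8) | (b << 16) | (a << 24);
    }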
@@ -785,7 +785,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
        final = _mm256_permute4x64_epi64(final, 0xD8);
#endif

        _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
        _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
    }

#if USE_8x2_TILE_BACKEND
@@ -848,8 +848,8 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8
        // store 8x2 memory order:
        //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
        //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
        _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
        _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
    }

#endif
@@ -894,29 +894,29 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_

        // splitting into two sets of 4 wide integer vector types
        // because AVX doesn't have instructions to support this operation at 8 wide
        __m128i srcLo0 = _mm256_castsi256_si128(src0);   // 000r000r000r000r
        __m128i srcLo1 = _mm256_castsi256_si128(src1);   // 000g000g000g000g
        __m128i srcLo2 = _mm256_castsi256_si128(src2);   // 000b000b000b000b
        simd4scalari srcLo0 = _mm256_castsi256_si128(src0);   // 000r000r000r000r
        simd4scalari srcLo1 = _mm256_castsi256_si128(src1);   // 000g000g000g000g
        simd4scalari srcLo2 = _mm256_castsi256_si128(src2);   // 000b000b000b000b

        __m128i srcHi0 = _mm256_extractf128_si256(src0, 1);   // 000r000r000r000r
        __m128i srcHi1 = _mm256_extractf128_si256(src1, 1);   // 000g000g000g000g
        __m128i srcHi2 = _mm256_extractf128_si256(src2, 1);   // 000b000b000b000b
        simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1);   // 000r000r000r000r
        simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1);   // 000g000g000g000g
        simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1);   // 000b000b000b000b

        srcLo1 = _mm_slli_si128(srcLo1, 1);   // 00g000g000g000g0
        srcHi1 = _mm_slli_si128(srcHi1, 1);   // 00g000g000g000g0
        srcLo2 = _mm_slli_si128(srcLo2, 2);   // 0b000b000b000b00
        srcHi2 = _mm_slli_si128(srcHi2, 2);   // 0b000b000b000b00

        srcLo0 = _mm_or_si128(srcLo0, srcLo1);   // 00gr00gr00gr00gr
        srcLo0 = SIMD128::or_si(srcLo0, srcLo1);   // 00gr00gr00gr00gr

        srcHi0 = _mm_or_si128(srcHi0, srcHi1);   // 00gr00gr00gr00gr
        srcHi0 = SIMD128::or_si(srcHi0, srcHi1);   // 00gr00gr00gr00gr

        srcLo0 = _mm_or_si128(srcLo0, srcLo2);   // 0bgr0bgr0bgr0bgr
        srcHi0 = _mm_or_si128(srcHi0, srcHi2);   // 0bgr0bgr0bgr0bgr
        srcLo0 = SIMD128::or_si(srcLo0, srcLo2);   // 0bgr0bgr0bgr0bgr
        srcHi0 = SIMD128::or_si(srcHi0, srcHi2);   // 0bgr0bgr0bgr0bgr

        // unpack into rows that get the tiling order correct
        __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);   // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
        __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
        simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);   // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
        simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);

        simdscalari final = _mm256_castsi128_si256(vRow00);
        final = _mm256_insertf128_si256(final, vRow10, 1);
@@ -936,7 +936,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_

#endif

        _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
        _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
    }

template<>