swr: [rasterizer core] SIMD16 Frontend WIP

Implement widened binner for SIMD16

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Tim Rowley 2017-03-29 12:58:18 -05:00
parent b8515d5c0f
commit d5157ddca4
4 changed files with 1299 additions and 75 deletions

View file

@ -46,10 +46,6 @@ struct simd16scalari
};
typedef uint16_t simd16mask;
#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
#else
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
@ -60,6 +56,10 @@ typedef __mmask16 simd16mask;
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16
#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
simd16scalar v[4];
@ -383,32 +383,26 @@ SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
simd16mask mask;
simdmask mask_lo = _mm256_movemask_ps(a.lo);
simdmask mask_hi = _mm256_movemask_ps(a.hi);
reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);
return mask;
return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 8);
}
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
simd16mask mask;
simdmask mask_lo = _mm256_movemask_pd(a.lo);
simdmask mask_hi = _mm256_movemask_pd(a.hi);
reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);
return mask;
return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 4);
}
INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
INLINE uint64_t _simd16_movemask_epi8(simd16scalari a)
{
simd16mask mask;
uint32_t mask_lo = _mm256_movemask_epi8(a.lo);
uint32_t mask_hi = _mm256_movemask_epi8(a.hi);
reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);
return mask;
return static_cast<uint64_t>(mask_lo) | (static_cast<uint64_t>(mask_hi) << 32);
}
INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
@ -809,12 +803,10 @@ INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}
#if 0
INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
}
#endif
#define _simd16_setzero_ps _mm512_setzero_ps
#define _simd16_setzero_si _mm512_setzero_si512
@ -889,6 +881,7 @@ INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, cons
}
#define _simd16_mul_ps _mm512_mul_ps
#define _simd16_div_ps _mm512_div_ps
#define _simd16_add_ps _mm512_add_ps
#define _simd16_sub_ps _mm512_sub_ps
#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
@ -900,12 +893,10 @@ INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
return _simd16_scalari2mask(_mm512_castps_si512(a));
}
#if 0
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
return _simd16_scalard2mask(_mm512i_castpd_si512(a));
return _simd16_scalard2mask(a);
}
#endif
#if 0
INLINE int _simd16_movemask_epi8(simd16scalari a)
@ -1040,7 +1031,6 @@ INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
#define _simd16_abs_epi32 _mm512_abs_epi32
#define _simd16_cmpeq_epi64 _mm512_abs_epi32
INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{

File diff suppressed because it is too large Load diff

View file

@ -112,6 +112,23 @@ void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3]
vB[1] = _simd_sub_epi32(vX[2], vX[1]);
vB[2] = _simd_sub_epi32(vX[0], vX[2]);
}
#if ENABLE_AVX512_SIMD16
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3])
{
// A = y0 - y1
// B = x1 - x0
vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
vA[2] = _simd16_sub_epi32(vY[2], vY[0]);
vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif
// Calculate the determinant of the triangle
// 2 vectors between the 3 points: P, Q
// Px = x0-x2, Py = y0-y2
@ -185,6 +202,44 @@ void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3]
pvDet[1] = vResultHi;
}
#if ENABLE_AVX512_SIMD16
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
{
// refer to calcDeterminantInt comment for calculation explanation
// A1*B2
simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
// B1*A2
simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
simd16scalari vA2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
simd16scalari vB1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
simd16scalari vB1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
simd16scalari vA2B1Lo = _simd16_mul_epi32(vA2Lo, vB1Lo);
simd16scalari vA2B1Hi = _simd16_mul_epi32(vA2Hi, vB1Hi);
// A1*B2 - A2*B1
simd16scalari detLo = _simd16_sub_epi64(vA1B2Lo, vA2B1Lo);
simd16scalari detHi = _simd16_sub_epi64(vA1B2Hi, vA2B1Hi);
// shuffle 0 1 4 5 -> 0 1 2 3
simd16scalari vResultLo = _simd16_permute2f128_si(detLo, detHi, 0x20);
simd16scalari vResultHi = _simd16_permute2f128_si(detLo, detHi, 0x31);
pvDet[0] = vResultLo;
pvDet[1] = vResultHi;
}
#endif
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
{
@ -227,6 +282,27 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
}
}
#if USE_SIMD16_FRONTEND
template<uint32_t NumVerts>
INLINE
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
{
const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
}
}
#endif
template<uint32_t NumVerts>
INLINE
void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
@ -247,6 +323,28 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices,
}
}
#if USE_SIMD16_FRONTEND
template<uint32_t NumVerts>
INLINE
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
for (uint32_t i = 0; i < NumVerts; ++i)
{
v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
}
}
#endif
INLINE
void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
{

View file

@ -83,6 +83,16 @@ struct simdBBox
simdscalari xmax;
};
#if ENABLE_AVX512_SIMD16
struct simd16BBox
{
simd16scalari ymin;
simd16scalari ymax;
simd16scalari xmin;
simd16scalari xmax;
};
#endif
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{