swr: [rasterizer core] SIMD16 Frontend WIP - fix tessellation crashes

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Tim Rowley 2017-03-07 16:23:18 -08:00
parent ab3f4449c3
commit 4cb69e817c
3 changed files with 35 additions and 31 deletions

View file

@ -610,6 +610,8 @@ INLINE static T RoundDownEven(T value)
/// ///
/// attribCount will limit the vector copies to those attribs specified /// attribCount will limit the vector copies to those attribs specified
/// ///
/// note: the stride between vertexes is determined by KNOB_NUM_ATTRIBUTES
///
void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount) void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{ {
SWR_ASSERT(vertex); SWR_ASSERT(vertex);
@ -1244,7 +1246,7 @@ static void TessellationStages(
uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
#if USE_SIMD16_FRONTEND #if USE_SIMD16_FRONTEND
size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSOutputVectors); // simd8 -> simd16, padding size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else #else
size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif #endif
@ -1253,7 +1255,7 @@ static void TessellationStages(
AlignedFree(gt_pTessellationThreadData->pDSOutput); AlignedFree(gt_pTessellationThreadData->pDSOutput);
gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64); gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
#if USE_SIMD16_FRONTEND #if USE_SIMD16_FRONTEND
gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSOutputVectors); // simd8 -> simd16, padding gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else #else
gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
#endif #endif
@ -1272,7 +1274,11 @@ static void TessellationStages(
dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
#if USE_SIMD16_FRONTEND
dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
dsContext.vectorStride = requiredDSVectorInvocations; dsContext.vectorStride = requiredDSVectorInvocations;
#endif
uint32_t dsInvocations = 0; uint32_t dsInvocations = 0;
@ -1289,19 +1295,14 @@ static void TessellationStages(
UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints); UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
#if USE_SIMD16_FRONTEND #if USE_SIMD16_FRONTEND
// TEMPORARY: DS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex, in-place SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
PackPairsOfSimdVertexIntoSimd16VertexInPlace(
reinterpret_cast<simdvertex *>(dsContext.pOutputData),
RoundUpEven(dsContext.vectorStride), // simd8 -> simd16
tsState.numDsOutputAttribs);
#endif #endif
PA_TESS tessPa( PA_TESS tessPa(
pDC, pDC,
#if USE_SIMD16_FRONTEND #if USE_SIMD16_FRONTEND
reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16 reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
RoundUpEven(dsContext.vectorStride) / 2, // simd8 -> simd16 dsContext.vectorStride / 2, // simd8 -> simd16
#else #else
dsContext.pOutputData, dsContext.pOutputData,
dsContext.vectorStride, dsContext.vectorStride,

View file

@ -233,8 +233,6 @@ struct PA_STATE_OPT : public PA_STATE
this->reset = false; this->reset = false;
} }
this->pfnPaFunc = this->pfnPaNextFunc;
if (!HasWork()) if (!HasWork())
{ {
morePrims = false; // no more to do morePrims = false; // no more to do
@ -290,12 +288,14 @@ struct PA_STATE_OPT : public PA_STATE
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
uint32_t numSimdPrims = 0, uint32_t numSimdPrims = 0,
uint32_t numPrimsIncrement = 0, uint32_t numPrimsIncrement = 0,
bool reset = false) bool reset = false)
{ {
this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16; this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
this->pfnPaNextFunc = pfnPaNextFunc;
this->nextNumSimdPrims = numSimdPrims; this->nextNumSimdPrims = numSimdPrims;
this->nextNumPrimsIncrement = numPrimsIncrement; this->nextNumPrimsIncrement = numPrimsIncrement;
this->nextReset = reset; this->nextReset = reset;
@ -344,12 +344,13 @@ INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNext
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
uint32_t numSimdPrims = 0, uint32_t numSimdPrims = 0,
uint32_t numPrimsIncrement = 0, uint32_t numPrimsIncrement = 0,
bool reset = false) bool reset = false)
{ {
return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
} }
#endif #endif

View file

@ -469,6 +469,7 @@ static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector ver
SetNextPaState_simd16( SetNextPaState_simd16(
pa, pa,
PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>, PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
PaPatchListSingle<TotalControlPoints>); PaPatchListSingle<TotalControlPoints>);
return false; return false;
@ -505,6 +506,7 @@ static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector
SetNextPaState_simd16( SetNextPaState_simd16(
pa, pa,
PaPatchList_simd16<TotalControlPoints>, PaPatchList_simd16<TotalControlPoints>,
PaPatchList<TotalControlPoints>,
PaPatchListSingle<TotalControlPoints>, PaPatchListSingle<TotalControlPoints>,
0, 0,
KNOB_SIMD16_WIDTH, KNOB_SIMD16_WIDTH,
@ -741,13 +743,13 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriListSingle0); SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
return false; // Not enough vertices to assemble 16 triangles return false; // Not enough vertices to assemble 16 triangles
} }
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriListSingle0); SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
return false; // Not enough vertices to assemble 16 triangles return false; // Not enough vertices to assemble 16 triangles
} }
@ -781,7 +783,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
v2[i] = _simd16_permute_ps(temp2, perm2); v2[i] = _simd16_permute_ps(temp2, perm2);
} }
SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true); SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true; return true;
} }
@ -1019,7 +1021,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStripSingle0); SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
return false; // Not enough vertices to assemble 16 triangles. return false; // Not enough vertices to assemble 16 triangles.
} }
@ -1050,7 +1052,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
} }
SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH); SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH);
return true; return true;
} }
@ -1285,7 +1287,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFanSingle0); SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
return false; // Not enough vertices to assemble 16 triangles. return false; // Not enough vertices to assemble 16 triangles.
} }
@ -1319,7 +1321,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
} }
SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFanSingle0, 0, KNOB_SIMD16_WIDTH); SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD16_WIDTH);
return true; return true;
} }
@ -1457,7 +1459,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadListSingle0); SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
return false; // Not enough vertices to assemble 16 triangles. return false; // Not enough vertices to assemble 16 triangles.
} }
@ -1485,7 +1487,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
} }
SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadListSingle0, 0, KNOB_SIMD16_WIDTH, true); SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true; return true;
} }
@ -1712,7 +1714,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoopSingle0); SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
return false; return false;
} }
@ -1735,7 +1737,7 @@ bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
} }
} }
SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoopSingle0, 0, KNOB_SIMD16_WIDTH); SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD16_WIDTH);
return true; return true;
} }
@ -1824,7 +1826,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineListSingle0); SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
return false; // Not enough vertices to assemble 16 lines return false; // Not enough vertices to assemble 16 lines
} }
@ -1849,7 +1851,7 @@ bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
} }
SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineListSingle0, 0, KNOB_SIMD16_WIDTH, true); SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true; return true;
} }
@ -2042,7 +2044,7 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
#if ENABLE_AVX512_SIMD16 #if ENABLE_AVX512_SIMD16
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStripSingle0); SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
return false; // Not enough vertices to assemble 16 lines return false; // Not enough vertices to assemble 16 lines
} }
@ -2069,7 +2071,7 @@ bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
} }
SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStripSingle0, 0, KNOB_SIMD16_WIDTH); SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD16_WIDTH);
return true; return true;
} }
@ -2234,7 +2236,7 @@ bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
verts[0] = a; // points only have 1 vertex. verts[0] = a; // points only have 1 vertex.
SetNextPaState_simd16(pa, PaPoints0_simd16, PaPointsSingle0, 0, KNOB_SIMD16_WIDTH, true); SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true; return true;
} }
@ -2390,7 +2392,7 @@ bool PaRectList2(
/// There is not enough to assemble 8 triangles. /// There is not enough to assemble 8 triangles.
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{ {
SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0); SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
return false; return false;
} }
@ -2494,7 +2496,7 @@ bool PaRectList1_simd16(
v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0); v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
} }
SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true); SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true; return true;
} }
@ -2510,7 +2512,7 @@ bool PaRectList2_simd16(
simd16vector verts[]) simd16vector verts[])
{ {
SWR_INVALID("Is rect list used for anything other then clears?"); SWR_INVALID("Is rect list used for anything other then clears?");
SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true); SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true; return true;
} }