swr/rast: reduce simd{16}vertex stack for VS output

Frontend: reduce simdvertex/simd16vertex stack usage for VS output in
ProcessDraw. This fixes a stack overflow in some of the deeper call stacks
under SIMD16.

1. Move the vertex store out of PA_FACTORY, and off the stack
2. Allocate the vertex store out of the aligned heap (pointer is
   temporarily stored in TLS, but will be migrated to thread pool
   along with other frontend temporary buffers).
3. Grow the vertex store as necessary for the number of verts per
   primitive, in chunks of 8 simdvertex or 4 simd16vertex.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Tim Rowley 2017-04-19 12:21:05 -05:00
parent 536baf507e
commit 0424e6249a
2 changed files with 54 additions and 16 deletions

View file

@ -551,6 +551,7 @@ static void StreamOut(
_mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
}
soMask &= ~(1 << slot);
}
@ -1345,8 +1346,6 @@ static void TessellationStages(
const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
const uint32_t primMask = GenMask(numPrims);
const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
const simdscalari primID_lo = _simd16_extract_si(primID, 0);
const simdscalari primID_hi = _simd16_extract_si(primID, 1);
@ -1390,9 +1389,9 @@ static void TessellationStages(
if (HasRastT::value)
{
#if USE_SIMD16_FRONTEND
simd16vector prim_simd16[3];
simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
simdvector prim[3]; // Only deal with triangles, lines, or points
simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
AR_BEGIN(FEPAAssemble, pDC->drawId);
bool assemble =
@ -1407,7 +1406,7 @@ static void TessellationStages(
SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
tessPa.useAlternateOffset = false;
pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0));
pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
#else
pfnClipFunc(pDC, tessPa, workerId, prim,
GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
@ -1420,9 +1419,21 @@ static void TessellationStages(
} // while (tessPa.HasWork())
} // for (uint32_t p = 0; p < numPrims; ++p)
#if USE_SIMD16_FRONTEND
if (gt_pTessellationThreadData->pDSOutput != nullptr)
{
AlignedFree(gt_pTessellationThreadData->pDSOutput);
gt_pTessellationThreadData->pDSOutput = nullptr;
}
gt_pTessellationThreadData->numDSOutputVectors = 0;
#endif
TSDestroyCtx(tsCtx);
}
THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
THREAD uint32_t gVertexStoreSize = 0;
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
@ -1530,8 +1541,36 @@ void ProcessDraw(
pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
}
const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);
SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
// grow the vertex store for the PA as necessary
if (gVertexStoreSize < vertexCount)
{
if (pVertexStore != nullptr)
{
AlignedFree(pVertexStore);
}
while (gVertexStoreSize < vertexCount)
{
#if USE_SIMD16_FRONTEND
gVertexStoreSize += 4; // grow in chunks of 4 simd16vertex
#else
gVertexStoreSize += 8; // grow in chunks of 8 simdvertex
#endif
}
SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);
pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));
SWR_ASSERT(pVertexStore != nullptr);
}
// choose primitive assembler
PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
PA_STATE& pa = paFactory.GetPA();
#if USE_SIMD16_FRONTEND
@ -1689,8 +1728,6 @@ void ProcessDraw(
const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
const uint32_t primMask = GenMask(numPrims);
const simd16scalari primID = pa.GetPrimID(work.startPrimID);
const simdscalari primID_lo = _simd16_extract_si(primID, 0);
const simdscalari primID_hi = _simd16_extract_si(primID, 1);
@ -1732,7 +1769,7 @@ void ProcessDraw(
StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
}
#else
pa.useAlternateOffset = false; // StreamOut() is SIMD16-compatible..
pa.useAlternateOffset = false;
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
#endif
}
@ -1742,7 +1779,7 @@ void ProcessDraw(
SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
pa.useAlternateOffset = false;
pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si());
pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
}
}
}

View file

@ -136,9 +136,9 @@ struct PA_STATE_OPT : public PA_STATE
uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
SIMDSCALARI primID;
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
@ -691,6 +691,7 @@ PRAGMA_WARNING_PUSH_DISABLE(4789)
pBase += SIMD_WIDTH;
}
}
return true;
}
PRAGMA_WARNING_POP()
@ -1392,7 +1393,7 @@ private:
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize) : topo(in_topo)
{
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
const API_STATE& state = GetApiState(pDC);
@ -1408,7 +1409,7 @@ struct PA_FACTORY
memset(&indexStore, 0, sizeof(indexStore));
uint32_t numAttribs = state.feNumAttributes;
new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
&this->indexStore[0], numVerts, numAttribs, state.topology, false);
cutPA = true;
}
@ -1416,7 +1417,7 @@ struct PA_FACTORY
#endif
{
uint32_t numPrims = GetNumPrims(in_topo, numVerts);
new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, false);
cutPA = false;
}
@ -1438,10 +1439,10 @@ struct PA_FACTORY
PA_STATE_OPT paOpt;
PA_STATE_CUT paCut;
bool cutPA{ false };
PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM];
PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};