swr/rast: FP consistency between POSH/RENDER pipes

- Ensure all threads have optimal floating-point control state
- Disable auto-generation of fused FP ops for VERTEX shader stage
- Disable "fast" FP ops for VERTEX shader stage

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Alok Hota 2018-08-28 12:23:31 -05:00
parent dc7b3c95a4
commit 0b4db43705
4 changed files with 33 additions and 11 deletions

View file

@ -294,4 +294,25 @@ int SWR_API
std::string* pOptStdErr = nullptr, ///< (Optional Out) Standard Error text
const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
/// Put the vector control/status register (MXCSR) into the preferred FP mode:
/// round-to-nearest, flush-to-zero on, denormals-are-zero on.
/// Every MXCSR bit outside those three fields is carried over unchanged.
/// @returns the MXCSR value that was in effect before this call
static INLINE uint32_t SetOptimalVectorCSR()
{
    // Fields this helper owns, and the values it forces them to.
    const uint32_t controlledBits = _MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK;
    const uint32_t optimalBits    = _MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON;

    const uint32_t previousCSR = _mm_getcsr();

    // Clear the owned fields first so a stale rounding mode cannot survive the OR.
    _mm_setcsr((previousCSR & ~controlledBits) | optimalBits);

    return previousCSR;
}
/// Write a previously captured value back into the vector control/status register (MXCSR).
/// @param csrState - value to install; expected to be the result of an earlier
///                   SetOptimalVectorCSR() call
static INLINE void RestoreVectorCSR(uint32_t csrState)
{
    const uint32_t savedCSR = csrState;
    _mm_setcsr(savedCSR);
}
#endif //__SWR_OS_H__

View file

@ -250,9 +250,7 @@ void QueueWork(SWR_CONTEXT* pContext)
if (pContext->threadInfo.SINGLE_THREADED)
{
// flush denormals to 0
uint32_t mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
uint32_t mxcsr = SetOptimalVectorCSR();
if (IsDraw)
{
@ -274,7 +272,7 @@ void QueueWork(SWR_CONTEXT* pContext)
}
// restore csr
_mm_setcsr(mxcsr);
RestoreVectorCSR(mxcsr);
}
else
{

View file

@ -1840,10 +1840,10 @@ void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, vo
{
vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
fetchInfo_lo.xpIndices =
pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
fetchInfo_hi.xpIndices =
pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
GetPrivateState(pDC),
&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
}
fetchInfo_lo.CurInstance = instanceNum;

View file

@ -421,9 +421,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CON
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
stats.PsInvocations += dynState.pStats[i].PsInvocations;
stats.CsInvocations += dynState.pStats[i].CsInvocations;
}
@ -439,6 +439,10 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONT
pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
pDC->retireCallback.userData2,
pDC->retireCallback.userData3);
// Callbacks to external code *could* change floating point control state
// Reset our optimal flags
SetOptimalVectorCSR();
}
}
@ -870,8 +874,7 @@ DWORD workerThreadMain(LPVOID pData)
uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
SetOptimalVectorCSR();
// Track tiles locked by other threads. If we try to lock a macrotile and find it's already
// locked then we'll add it to this list so that we don't try to lock it again.