mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-04 22:49:13 +02:00
swr: [rasterizer core] Add experimental support for hyper-threaded front-end
Acked-by: Brian Paul <brianp@vmware.com>
This commit is contained in:
parent
9a8146d0ff
commit
a939a58881
5 changed files with 140 additions and 57 deletions
|
|
@ -87,7 +87,10 @@ HANDLE SwrCreateContext(
|
|||
// Calling createThreadPool() above can set SINGLE_THREADED
|
||||
if (KNOB_SINGLE_THREADED)
|
||||
{
|
||||
SET_KNOB(HYPERTHREADED_FE, false);
|
||||
pContext->NumWorkerThreads = 1;
|
||||
pContext->NumFEThreads = 1;
|
||||
pContext->NumBEThreads = 1;
|
||||
}
|
||||
|
||||
// Allocate scratch space for workers.
|
||||
|
|
@ -177,8 +180,7 @@ void QueueWork(SWR_CONTEXT *pContext)
|
|||
// multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
|
||||
// have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
|
||||
// then moved on if all work is done.)
|
||||
pContext->pCurDrawContext->threadsDone =
|
||||
pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
|
||||
pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
|
||||
|
||||
_ReadWriteBarrier();
|
||||
{
|
||||
|
|
@ -196,7 +198,7 @@ void QueueWork(SWR_CONTEXT *pContext)
|
|||
{
|
||||
static TileSet lockedTiles;
|
||||
uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
|
||||
WorkOnFifoFE(pContext, 0, curDraw[0], 0);
|
||||
WorkOnFifoFE(pContext, 0, curDraw[0]);
|
||||
WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -382,32 +382,28 @@ struct DRAW_STATE
|
|||
// This draw context maintains all of the state needed for the draw operation.
|
||||
struct DRAW_CONTEXT
|
||||
{
|
||||
SWR_CONTEXT *pContext;
|
||||
SWR_CONTEXT* pContext;
|
||||
uint64_t drawId;
|
||||
MacroTileMgr* pTileMgr;
|
||||
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
|
||||
uint64_t dependency;
|
||||
DRAW_STATE* pState;
|
||||
CachingArena* pArena;
|
||||
|
||||
uint64_t drawId;
|
||||
bool isCompute; // Is this DC a compute context?
|
||||
bool cleanupState; // True if this is the last draw using an entry in the state ring.
|
||||
volatile bool doneFE; // Is FE work done for this draw?
|
||||
|
||||
bool isCompute; // Is this DC a compute context?
|
||||
volatile OSALIGNLINE(uint32_t) FeLock;
|
||||
volatile int64_t threadsDone;
|
||||
|
||||
FE_WORK FeWork;
|
||||
volatile OSALIGNLINE(uint32_t) FeLock;
|
||||
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
|
||||
volatile OSALIGNLINE(int64_t) threadsDone;
|
||||
OSALIGNLINE(FE_WORK) FeWork;
|
||||
uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
|
||||
|
||||
uint64_t dependency;
|
||||
|
||||
MacroTileMgr* pTileMgr;
|
||||
|
||||
// The following fields are valid if isCompute is true.
|
||||
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
|
||||
|
||||
DRAW_STATE* pState;
|
||||
CachingArena* pArena;
|
||||
|
||||
uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
|
||||
|
||||
bool cleanupState; // True if this is the last draw using an entry in the state ring.
|
||||
};
|
||||
|
||||
static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
|
||||
|
||||
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
|
||||
{
|
||||
SWR_ASSERT(pDC != nullptr);
|
||||
|
|
@ -459,6 +455,8 @@ struct SWR_CONTEXT
|
|||
uint32_t curStateId; // Current index to the next available entry in the DS ring.
|
||||
|
||||
uint32_t NumWorkerThreads;
|
||||
uint32_t NumFEThreads;
|
||||
uint32_t NumBEThreads;
|
||||
|
||||
THREAD_POOL threadPool; // Thread pool associated with this context
|
||||
|
||||
|
|
|
|||
|
|
@ -305,10 +305,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
|
|||
return result;
|
||||
}
|
||||
|
||||
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
|
||||
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
|
||||
{
|
||||
// increment our current draw id to the first incomplete draw
|
||||
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
|
||||
drawEnqueued = GetEnqueuedDraw(pContext);
|
||||
while (curDrawBE < drawEnqueued)
|
||||
{
|
||||
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
|
||||
|
|
@ -316,8 +316,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
|
|||
// If it's not compute and FE is not done then break out of the loop.
|
||||
if (!pDC->doneFE && !pDC->isCompute) break;
|
||||
|
||||
bool isWorkComplete = (pDC->isCompute) ?
|
||||
pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
|
||||
bool isWorkComplete = pDC->isCompute ?
|
||||
pDC->pDispatch->isWorkComplete() :
|
||||
pDC->pTileMgr->isWorkComplete();
|
||||
|
||||
if (isWorkComplete)
|
||||
{
|
||||
|
|
@ -358,7 +359,8 @@ void WorkOnFifoBE(
|
|||
{
|
||||
// Find the first incomplete draw that has pending work. If no such draw is found then
|
||||
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
|
||||
if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
|
||||
uint64_t drawEnqueued = 0;
|
||||
if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
|
@ -373,7 +375,7 @@ void WorkOnFifoBE(
|
|||
// 2. If we're trying to work on draws after curDrawBE, we are restricted to
|
||||
// working on those macrotiles that are known to be complete in the prior draw to
|
||||
// maintain order. The locked tiles provide the history to ensure this.
|
||||
for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
|
||||
for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
|
||||
{
|
||||
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
|
||||
|
||||
|
|
@ -466,7 +468,7 @@ void WorkOnFifoBE(
|
|||
}
|
||||
}
|
||||
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
|
||||
{
|
||||
// Try to grab the next DC from the ring
|
||||
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
|
||||
|
|
@ -519,38 +521,43 @@ void WorkOnCompute(
|
|||
uint32_t workerId,
|
||||
uint64_t& curDrawBE)
|
||||
{
|
||||
if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
|
||||
uint64_t drawEnqueued = 0;
|
||||
if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
|
||||
|
||||
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
|
||||
if (pDC->isCompute == false) return;
|
||||
|
||||
// check dependencies
|
||||
if (CheckDependency(pContext, pDC, lastRetiredDraw))
|
||||
for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i)
|
||||
{
|
||||
return;
|
||||
}
|
||||
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
|
||||
if (pDC->isCompute == false) return;
|
||||
|
||||
SWR_ASSERT(pDC->pDispatch != nullptr);
|
||||
DispatchQueue& queue = *pDC->pDispatch;
|
||||
|
||||
// Is there any work remaining?
|
||||
if (queue.getNumQueued() > 0)
|
||||
{
|
||||
uint32_t threadGroupId = 0;
|
||||
while (queue.getWork(threadGroupId))
|
||||
// check dependencies
|
||||
if (CheckDependency(pContext, pDC, lastRetiredDraw))
|
||||
{
|
||||
ProcessComputeBE(pDC, workerId, threadGroupId);
|
||||
return;
|
||||
}
|
||||
|
||||
queue.finishedWork();
|
||||
SWR_ASSERT(pDC->pDispatch != nullptr);
|
||||
DispatchQueue& queue = *pDC->pDispatch;
|
||||
|
||||
// Is there any work remaining?
|
||||
if (queue.getNumQueued() > 0)
|
||||
{
|
||||
uint32_t threadGroupId = 0;
|
||||
while (queue.getWork(threadGroupId))
|
||||
{
|
||||
ProcessComputeBE(pDC, workerId, threadGroupId);
|
||||
|
||||
queue.finishedWork();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<bool IsFEThread, bool IsBEThread>
|
||||
DWORD workerThreadMain(LPVOID pData)
|
||||
{
|
||||
THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
|
||||
|
|
@ -634,25 +641,38 @@ DWORD workerThreadMain(LPVOID pData)
|
|||
}
|
||||
}
|
||||
|
||||
RDTSC_START(WorkerWorkOnFifoBE);
|
||||
WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
|
||||
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
|
||||
if (IsBEThread)
|
||||
{
|
||||
RDTSC_START(WorkerWorkOnFifoBE);
|
||||
WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
|
||||
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
|
||||
|
||||
WorkOnCompute(pContext, workerId, curDrawBE);
|
||||
WorkOnCompute(pContext, workerId, curDrawBE);
|
||||
}
|
||||
|
||||
WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
|
||||
if (IsFEThread)
|
||||
{
|
||||
WorkOnFifoFE(pContext, workerId, curDrawFE);
|
||||
|
||||
if (!IsBEThread)
|
||||
{
|
||||
curDrawBE = curDrawFE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
|
||||
|
||||
template <bool IsFEThread, bool IsBEThread>
|
||||
DWORD workerThreadInit(LPVOID pData)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
__try
|
||||
#endif // _WIN32
|
||||
{
|
||||
return workerThreadMain(pData);
|
||||
return workerThreadMain<IsFEThread, IsBEThread>(pData);
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
|
@ -664,6 +684,7 @@ DWORD workerThreadInit(LPVOID pData)
|
|||
|
||||
return 1;
|
||||
}
|
||||
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
|
||||
|
||||
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
{
|
||||
|
|
@ -681,6 +702,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
uint32_t numCoresPerNode = numHWCoresPerNode;
|
||||
uint32_t numHyperThreads = numHWHyperThreads;
|
||||
|
||||
if (KNOB_MAX_WORKER_THREADS)
|
||||
{
|
||||
SET_KNOB(HYPERTHREADED_FE, false);
|
||||
}
|
||||
|
||||
if (KNOB_HYPERTHREADED_FE)
|
||||
{
|
||||
SET_KNOB(MAX_THREADS_PER_CORE, 0);
|
||||
}
|
||||
|
||||
if (KNOB_MAX_NUMA_NODES)
|
||||
{
|
||||
numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
|
||||
|
|
@ -696,6 +727,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
|
||||
}
|
||||
|
||||
if (numHyperThreads < 2)
|
||||
{
|
||||
SET_KNOB(HYPERTHREADED_FE, false);
|
||||
}
|
||||
|
||||
// Calculate numThreads
|
||||
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
|
||||
|
||||
|
|
@ -770,9 +806,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
|
||||
pPool->pThreadData[workerId].threadId = 0;
|
||||
pPool->pThreadData[workerId].numaId = 0;
|
||||
pPool->pThreadData[workerId].coreId = 0;
|
||||
pPool->pThreadData[workerId].htId = 0;
|
||||
pPool->pThreadData[workerId].pContext = pContext;
|
||||
pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
|
||||
|
||||
pContext->NumBEThreads++;
|
||||
pContext->NumFEThreads++;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
|
@ -804,8 +845,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
|||
pPool->pThreadData[workerId].procGroupId = core.procGroup;
|
||||
pPool->pThreadData[workerId].threadId = core.threadIds[t];
|
||||
pPool->pThreadData[workerId].numaId = n;
|
||||
pPool->pThreadData[workerId].coreId = c;
|
||||
pPool->pThreadData[workerId].htId = t;
|
||||
pPool->pThreadData[workerId].pContext = pContext;
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
|
||||
|
||||
if (KNOB_HYPERTHREADED_FE)
|
||||
{
|
||||
if (t == 0)
|
||||
{
|
||||
pContext->NumBEThreads++;
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
|
||||
}
|
||||
else
|
||||
{
|
||||
pContext->NumFEThreads++;
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
|
||||
pContext->NumBEThreads++;
|
||||
pContext->NumFEThreads++;
|
||||
}
|
||||
|
||||
++workerId;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,6 +41,8 @@ struct THREAD_DATA
|
|||
uint32_t procGroupId; // Will always be 0 for non-Windows OS
|
||||
uint32_t threadId; // within the procGroup for Windows
|
||||
uint32_t numaId; // NUMA node id
|
||||
uint32_t coreId; // Core id
|
||||
uint32_t htId; // Hyperthread id
|
||||
uint32_t workerId;
|
||||
SWR_CONTEXT *pContext;
|
||||
bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
|
||||
|
|
@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
|||
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
|
||||
|
||||
// Expose FE and BE worker functions to the API thread if single threaded
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
|
||||
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
|
||||
void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
|
||||
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
|
||||
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
|
||||
|
|
@ -30,6 +30,18 @@ KNOBS = [
|
|||
'category' : 'debug',
|
||||
}],
|
||||
|
||||
['HYPERTHREADED_FE', {
|
||||
'type' : 'bool',
|
||||
'default' : 'false',
|
||||
'desc' : ['EXPERIMENTAL!!',
|
||||
'If enabled will attempt to use secondary threads per core to perform',
|
||||
'front-end (VS/GS) work.',
|
||||
'',
|
||||
'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['DUMP_SHADER_IR', {
|
||||
'type' : 'bool',
|
||||
'default' : 'false',
|
||||
|
|
@ -166,6 +178,7 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['TOSS_FETCH', {
|
||||
|
|
@ -175,6 +188,7 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['TOSS_IA', {
|
||||
|
|
@ -184,6 +198,7 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['TOSS_VS', {
|
||||
|
|
@ -193,6 +208,7 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['TOSS_SETUP_TRIS', {
|
||||
|
|
@ -202,6 +218,7 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['TOSS_BIN_TRIS', {
|
||||
|
|
@ -211,6 +228,7 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],
|
||||
|
||||
['TOSS_RS', {
|
||||
|
|
@ -220,4 +238,5 @@ KNOBS = [
|
|||
'',
|
||||
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
|
||||
'category' : 'perf',
|
||||
'advanced' : 'true',
|
||||
}],]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue