mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 19:40:10 +01:00
swr: [rasterizer core] SIMD16 Frontend WIP
Fix GS and streamout. Reviewed-by: George Kyriazis <george.kyriazis@intel.com>
This commit is contained in:
parent
fee3fc018b
commit
549b9d2e9f
2 changed files with 136 additions and 22 deletions
|
|
@ -376,7 +376,16 @@ public:
|
|||
const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
|
||||
|
||||
uint32_t numClippedPrims = 0;
|
||||
#if USE_SIMD16_FRONTEND
|
||||
const uint32_t numPrims = pa.NumPrims();
|
||||
const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
|
||||
|
||||
SWR_ASSERT(numPrims <= numPrims_lo);
|
||||
|
||||
for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim)
|
||||
#else
|
||||
for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
|
||||
#endif
|
||||
{
|
||||
uint32_t numEmittedVerts = pVertexCount[inputPrim];
|
||||
if (numEmittedVerts < NumVertsPerPrim)
|
||||
|
|
@ -391,13 +400,28 @@ public:
|
|||
// tranpose clipper output so that each lane's vertices are in SIMD order
|
||||
// set aside space for 2 vertices, as the PA will try to read up to 16 verts
|
||||
// for triangle fan
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simd16vertex transposedPrims[2];
|
||||
#else
|
||||
simdvertex transposedPrims[2];
|
||||
#endif
|
||||
|
||||
// transpose pos
|
||||
uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
// TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug
|
||||
static const float *dummy = reinterpret_cast<const float *>(pBase);
|
||||
#endif
|
||||
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
|
||||
transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
|
||||
#else
|
||||
transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
|
||||
#endif
|
||||
pBase += sizeof(simdscalar);
|
||||
}
|
||||
|
||||
|
|
@ -408,7 +432,12 @@ public:
|
|||
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
|
||||
transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
|
||||
#else
|
||||
transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
|
||||
#endif
|
||||
pBase += sizeof(simdscalar);
|
||||
}
|
||||
}
|
||||
|
|
@ -419,7 +448,12 @@ public:
|
|||
pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
|
||||
transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
|
||||
#else
|
||||
transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
|
||||
#endif
|
||||
pBase += sizeof(simdscalar);
|
||||
}
|
||||
}
|
||||
|
|
@ -429,7 +463,12 @@ public:
|
|||
pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
|
||||
transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
|
||||
#else
|
||||
transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
|
||||
#endif
|
||||
pBase += sizeof(simdscalar);
|
||||
}
|
||||
}
|
||||
|
|
@ -440,6 +479,27 @@ public:
|
|||
{
|
||||
do
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simd16vector attrib_simd16[NumVertsPerPrim];
|
||||
bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
|
||||
|
||||
if (assemble)
|
||||
{
|
||||
static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
|
||||
|
||||
simdvector attrib[NumVertsPerPrim];
|
||||
for (uint32_t i = 0; i < NumVertsPerPrim; i += 1)
|
||||
{
|
||||
for (uint32_t j = 0; j < 4; j += 1)
|
||||
{
|
||||
attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
|
||||
}
|
||||
}
|
||||
|
||||
clipPa.useAlternateOffset = false;
|
||||
pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
|
||||
}
|
||||
#else
|
||||
simdvector attrib[NumVertsPerPrim];
|
||||
bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
|
||||
if (assemble)
|
||||
|
|
@ -447,6 +507,7 @@ public:
|
|||
static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
|
||||
pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
|
||||
}
|
||||
#endif
|
||||
} while (clipPa.NextPrim());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -495,6 +495,9 @@ static void StreamOut(
|
|||
PA_STATE& pa,
|
||||
uint32_t workerId,
|
||||
uint32_t* pPrimData,
|
||||
#if USE_SIMD16_FRONTEND
|
||||
uint32_t numPrims_simd8,
|
||||
#endif
|
||||
uint32_t streamIndex)
|
||||
{
|
||||
SWR_CONTEXT *pContext = pDC->pContext;
|
||||
|
|
@ -517,7 +520,12 @@ static void StreamOut(
|
|||
soContext.pBuffer[i] = &state.soBuffer[i];
|
||||
}
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
uint32_t numPrims = numPrims_simd8;
|
||||
#else
|
||||
uint32_t numPrims = pa.NumPrims();
|
||||
#endif
|
||||
|
||||
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
|
||||
{
|
||||
DWORD slot = 0;
|
||||
|
|
@ -604,7 +612,7 @@ INLINE static T RoundDownEven(T value)
|
|||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// Pack pairs of simdvertexes into simd16vertexes, in-place
|
||||
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
|
||||
///
|
||||
/// vertexCount is in terms of the source simdvertexes and must be even
|
||||
///
|
||||
|
|
@ -612,10 +620,10 @@ INLINE static T RoundDownEven(T value)
|
|||
///
|
||||
/// note: the stride between vertexes is determinded by KNOB_NUM_ATTRIBUTES
|
||||
///
|
||||
void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
|
||||
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
|
||||
{
|
||||
SWR_ASSERT(vertex);
|
||||
SWR_ASSERT(IsEven(vertexCount));
|
||||
SWR_ASSERT(vertex_simd16);
|
||||
SWR_ASSERT(attribCount <= KNOB_NUM_ATTRIBUTES);
|
||||
|
||||
simd16vertex temp;
|
||||
|
|
@ -626,14 +634,18 @@ void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t v
|
|||
{
|
||||
for (uint32_t k = 0; k < 4; k += 1)
|
||||
{
|
||||
temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
|
||||
temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
|
||||
temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
|
||||
|
||||
if ((i + 1) < vertexCount)
|
||||
{
|
||||
temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t j = 0; j < attribCount; j += 1)
|
||||
{
|
||||
reinterpret_cast<simd16vertex *>(vertex)[i >> 1].attrib[j] = temp.attrib[j];
|
||||
vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -704,17 +716,16 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
|
|||
|
||||
THREAD SWR_GS_CONTEXT tlsGsContext;
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
THREAD simd16vertex tempVertex_simd16[128];
|
||||
|
||||
#endif
|
||||
template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
|
||||
struct GsBufferInfo
|
||||
{
|
||||
GsBufferInfo(const SWR_GS_STATE &gsState)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
// TEMPORARY: pad up to multiple of two, to support in-place conversion from simdvertex to simd16vertex
|
||||
const uint32_t vertexCount = RoundUpEven(gsState.maxNumVerts);
|
||||
#else
|
||||
const uint32_t vertexCount = gsState.maxNumVerts;
|
||||
#endif
|
||||
const uint32_t vertexStride = sizeof(SIMDVERTEX);
|
||||
const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
|
||||
|
||||
|
|
@ -896,18 +907,19 @@ static void GeometryShaderStage(
|
|||
}
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
// TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex, in-place
|
||||
// TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex
|
||||
|
||||
const uint32_t attribCount = VERTEX_ATTRIB_START_SLOT + pState->numInputAttribs;
|
||||
SWR_ASSERT(numEmittedVerts <= 256);
|
||||
|
||||
PackPairsOfSimdVertexIntoSimd16VertexInPlace(
|
||||
reinterpret_cast<simdvertex *>(pBase),
|
||||
RoundUpEven(numEmittedVerts), // simd8 -> simd16
|
||||
attribCount);
|
||||
PackPairsOfSimdVertexIntoSimd16Vertex(
|
||||
tempVertex_simd16,
|
||||
reinterpret_cast<const simdvertex *>(pBase),
|
||||
numEmittedVerts,
|
||||
KNOB_NUM_ATTRIBUTES);
|
||||
|
||||
#endif
|
||||
#if USE_SIMD16_FRONTEND
|
||||
PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
|
||||
PA_STATE_CUT gsPa(pDC, reinterpret_cast<uint8_t *>(tempVertex_simd16), numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
|
||||
|
||||
#else
|
||||
PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
|
||||
|
|
@ -932,7 +944,22 @@ static void GeometryShaderStage(
|
|||
|
||||
if (HasStreamOutT::value)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
const uint32_t numPrims = gsPa.NumPrims();
|
||||
const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
|
||||
const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
|
||||
|
||||
gsPa.useAlternateOffset = false;
|
||||
StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
|
||||
|
||||
if (numPrims_hi)
|
||||
{
|
||||
gsPa.useAlternateOffset = true;
|
||||
StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
|
||||
}
|
||||
#else
|
||||
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (HasRastT::value && state.soState.streamToRasterizer == stream)
|
||||
|
|
@ -1349,7 +1376,18 @@ static void TessellationStages(
|
|||
{
|
||||
if (HasStreamOutT::value)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
tessPa.useAlternateOffset = false;
|
||||
StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);
|
||||
|
||||
if (numPrims_hi)
|
||||
{
|
||||
tessPa.useAlternateOffset = true;
|
||||
StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
|
||||
}
|
||||
#else
|
||||
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (HasRastT::value)
|
||||
|
|
@ -1487,7 +1525,11 @@ void ProcessDraw(
|
|||
void* pStreamCutBuffer = nullptr;
|
||||
if (HasGeometryShaderT::value)
|
||||
{
|
||||
#if USE_SIMD16_FRONTEND
|
||||
AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
|
||||
#else
|
||||
AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (HasTessellationT::value)
|
||||
|
|
@ -1638,9 +1680,9 @@ void ProcessDraw(
|
|||
|
||||
// copy SIMD vout_lo to lo part of SIMD16 vout
|
||||
{
|
||||
const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes;
|
||||
const uint32_t attribCount = sizeof(vout.attrib) / sizeof(vout.attrib[0]);
|
||||
|
||||
for (uint32_t i = 0; i < voutNumSlots; i += 1)
|
||||
for (uint32_t i = 0; i < attribCount; i += 1)
|
||||
{
|
||||
for (uint32_t j = 0; j < 4; j += 1)
|
||||
{
|
||||
|
|
@ -1655,9 +1697,9 @@ void ProcessDraw(
|
|||
|
||||
// copy SIMD vout_hi to hi part of SIMD16 vout
|
||||
{
|
||||
const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes;
|
||||
const uint32_t attribCount = sizeof(vout.attrib) / sizeof(vout.attrib[0]);
|
||||
|
||||
for (uint32_t i = 0; i < voutNumSlots; i += 1)
|
||||
for (uint32_t i = 0; i < attribCount; i += 1)
|
||||
{
|
||||
for (uint32_t j = 0; j < 4; j += 1)
|
||||
{
|
||||
|
|
@ -1732,8 +1774,19 @@ void ProcessDraw(
|
|||
// If streamout is enabled then stream vertices out to memory.
|
||||
if (HasStreamOutT::value)
|
||||
{
|
||||
#if 1
|
||||
pa.useAlternateOffset = false;
|
||||
StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);
|
||||
|
||||
if (numPrims_hi)
|
||||
{
|
||||
pa.useAlternateOffset = true;
|
||||
StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
|
||||
}
|
||||
#else
|
||||
pa.useAlternateOffset = false; // StreamOut() is SIMD16-compatible..
|
||||
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (HasRastT::value)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue