swr/rast: Fix 64bit float loads in x86 lowering pass

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
George Kyriazis 2018-04-06 16:39:09 -05:00
parent 1ffbbbee97
commit 96ad8f5a23
2 changed files with 25 additions and 45 deletions
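
Context for the diff below: the old GATHERPD emitted an AVX2 VGATHERPD when available and otherwise open-coded a scalar fallback, and that fallback hard-coded 4-wide vectors (VectorType::get(mDoubleTy, 4), VECTOR_SPLAT(4, ...)), which cannot be correct for SIMD16 fetch, where each half of the gather covers 8 doubles. After this commit GATHERPD always emits the VGATHERPD intrinsic and the x86 lowering pass is responsible for legalizing it per target and per width. As a reference for the semantics any such expansion must preserve, here is a minimal standalone C++ sketch of a masked 64-bit gather; the function name and the explicit width parameter are illustrative, not SWR code.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Scalar reference for a masked 64-bit gather: each active lane loads a
    // double from base + index * scale (byte addressing); inactive lanes keep
    // the pass-through value from src.  In the JIT, width is mVWidth / 2,
    // because a full-width float fetch is handled as two double-wide halves.
    static void gather_pd_ref(double*             dst,
                              const double*       src,      // pass-through values
                              const std::uint8_t* base,     // byte base pointer
                              const std::int32_t* indices,  // per-lane byte indices
                              const bool*         mask,     // per-lane enable
                              std::uint8_t        scale,
                              std::size_t         width)
    {
        for (std::size_t i = 0; i < width; ++i)
        {
            if (mask[i])
            {
                std::memcpy(&dst[i],
                            base + static_cast<std::int64_t>(indices[i]) * scale,
                            sizeof(double));
            }
            else
            {
                dst[i] = src[i];  // masked-off lane: keep source value
            }
        }
    }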


@@ -201,44 +201,7 @@ namespace SwrJit
     /// @param scale - value to scale indices by
     Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
     {
-        Value* vGather;
-
-        // use avx2 gather instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
-            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
-        }
-        else
-        {
-            Value* pStack = STACKSAVE();
-            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
-            Value* vSrcPtr = ALLOCA(vSrc->getType());
-            SetTempAlloca(vSrcPtr);
-            STORE(vSrc, vSrcPtr);
-
-            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
-            Value *vOffsets = MUL(vIndices, vScaleVec);
-            for (uint32_t i = 0; i < mVWidth / 2; ++i)
-            {
-                // single component byte index
-                Value *offset = VEXTRACT(vOffsets, C(i));
-                // byte pointer to component
-                Value *loadAddress = GEP(pBase, offset);
-                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
-                // pointer to the value to load if we're masking off a component
-                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-                Value *selMask = VEXTRACT(vMask, C(i));
-                // switch in a safe address to load if we're trying to access a vertex
-                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
-                Value *val = LOAD(validAddress);
-                vGather = VINSERT(vGather, val, C(i));
-            }
-
-            STACKRESTORE(pStack);
-        }
-        return vGather;
+        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
     }

     //////////////////////////////////////////////////////////////////////////
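
A detail of the removed fallback that remains instructive for whoever writes the expansion in the lowering pass: it avoided per-lane branching by spilling vSrc to a stack slot and using SELECT to choose between the real load address and that slot, so every lane issues exactly one load and never dereferences an invalid address. A hedged plain-C++ rendering of that address-select trick (the helper name and the fixed 16-lane scratch buffer are hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Every lane performs one unconditional load; masked-off lanes are simply
    // redirected to a safe on-stack copy of src (the ALLOCA'd vSrcPtr in the
    // removed IR), so no branch is needed and no invalid address is read.
    static void gather_pd_select(double* dst, const double* src,
                                 const std::uint8_t* base,
                                 const std::int32_t* offsets,  // pre-scaled byte offsets
                                 const bool* mask, std::size_t width)
    {
        double safe[16] = {};  // scratch copy; assumes width <= 16
        for (std::size_t i = 0; i < width; ++i)
            safe[i] = src[i];

        for (std::size_t i = 0; i < width; ++i)
        {
            // Mirrors SELECT(selMask, loadAddress, maskLoadAddress).
            const std::uint8_t* addr =
                mask[i] ? base + offsets[i]
                        : reinterpret_cast<const std::uint8_t*>(&safe[i]);
            std::memcpy(&dst[i], addr, sizeof(double));
        }
    }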


@@ -230,7 +230,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     }

     // Fetch attributes from memory and output to a simdvertex struct
-    // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
     JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

     RET_VOID();
@@ -763,13 +762,31 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                 // if we need to gather the component
                 if (compCtrl[i] == StoreSrc)
                 {
-                    Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
-                    Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
+                    Value* vShufLo;
+                    Value* vShufHi;
+                    Value* vShufAll;

-                    Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
-                    Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
+                    if (mVWidth == 8)
+                    {
+                        vShufLo = C({ 0, 1, 2, 3 });
+                        vShufHi = C({ 4, 5, 6, 7 });
+                        vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+                    }
+                    else
+                    {
+                        SWR_ASSERT(mVWidth == 16);
+                        vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+                        vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
+                        vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
+                    }

-                    Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+                    Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
+                    Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
+
+                    Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
+                    Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
+
+                    Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                     Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
                     Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
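
The rewrite above replaces the hard-coded 4-lane shuffle constants and VEXTRACTI128 (which can only split a 256-bit vector in half) with shuffle masks derived from mVWidth, so the lo/hi split now works for both 8-wide and 16-wide fetch. A small illustrative C++ sketch of the index pattern (the template helper is hypothetical, not SWR code):

    #include <array>
    #include <cstddef>

    // The lo half takes lanes [0, W/2), the hi half takes lanes [W/2, W), so
    // each GATHERPD call sees exactly W/2 offsets and W/2 mask bits.
    template <std::size_t W>
    static std::array<int, W / 2> half_lanes(const std::array<int, W>& v, bool hi)
    {
        std::array<int, W / 2> out{};
        const std::size_t start = hi ? W / 2 : 0;  // vShufHi starts at W/2
        for (std::size_t i = 0; i < W / 2; ++i)
            out[i] = v[start + i];                 // same pattern as vShufLo/vShufHi
        return out;
    }

At W == 8 this reproduces the old {0, 1, 2, 3} / {4, 5, 6, 7} split; at W == 16 it yields {0..7} / {8..15}, which the VEXTRACTI128 path could not express.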
@@ -777,7 +794,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                     pGatherLo = VCVTPD2PS(pGatherLo);
                     pGatherHi = VCVTPD2PS(pGatherHi);

-                    Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
+                    Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);

                     vVertexElements[currentVertexElement++] = pGather;
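
With both halves gathered as doubles, each is converted to floats (VCVTPD2PS) and the two mVWidth/2-wide results are concatenated back into one mVWidth-wide vector. Using vShufAll instead of the literal C({0, 1, 2, 3, 4, 5, 6, 7}) makes this final shuffle width-correct at SIMD16 as well. A minimal C++ sketch of the convert-and-join step (illustrative only):

    #include <array>
    #include <cstddef>

    // Convert each gathered double half to float (VCVTPD2PS), then concatenate
    // lo and hi into one W-wide vector (VSHUFFLE with vShufAll).
    template <std::size_t W>
    static std::array<float, W> cvt_and_join(const std::array<double, W / 2>& lo,
                                             const std::array<double, W / 2>& hi)
    {
        std::array<float, W> out{};
        for (std::size_t i = 0; i < W / 2; ++i)
        {
            out[i]         = static_cast<float>(lo[i]);
            out[W / 2 + i] = static_cast<float>(hi[i]);
        }
        return out;
    }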