swr/rast: Use llvm intrinsic masked gather

Use the LLVM masked.gather intrinsic instead of a manual unroll for the
cases where we have a vector of pointers.  This improves the LLVM IR debug
experience by reducing a ton of IR to a single intrinsic call. It also
seems to reduce overall stack use considerably.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
George Kyriazis 2018-02-02 17:03:01 -06:00
parent 9cc9688e49
commit e12db47a7d
2 changed files with 14 additions and 0 deletions

View file

@ -346,6 +346,18 @@ namespace SwrJit
return vGather;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Masked gather whose source is a vector of pointers rather than a
///        base pointer plus indices. Emits one call to the LLVM
///        masked_gather intrinsic.
/// @param pVecSrcPtr - SIMD wide vector of pointers to gather from
/// @param pVecMask - SIMD active lanes
/// @param pVecPassthru - SIMD wide vector of values to use when lane is inactive
/// @return Result of the intrinsic call
Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
{
    // The intrinsic declaration is specialized on the passthru operand's type.
    auto* pGatherTy = pVecPassthru->getType();
    Function* pGatherFn = llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::masked_gather, { pGatherTy });

    // Operands in order: pointers, alignment (passed as 0), mask, passthru.
    Value* pAlignment = C(0);
    return CALL(pGatherFn, { pVecSrcPtr, pAlignment, pVecMask, pVecPassthru });
}
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput)
{

View file

@ -58,6 +58,8 @@ virtual void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byte
Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
// Masked gather where the source is a SIMD wide vector of pointers;
// pVecMask selects the active lanes and pVecPassthru supplies the values for inactive lanes.
Value *GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);