mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-09 06:48:06 +02:00
swr/rasterizer: Better implementation of scatter
Added support for avx512 scatter instruction. Non-avx512 will now call into a C function to do the scatter emulation. This has better jit compile performance than the previous approach of jitting scalar loops. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
ad9aff5528
commit
5dd9ad1570
7 changed files with 225 additions and 79 deletions
|
|
@ -156,6 +156,7 @@ JITTER_CXX_SOURCES := \
|
||||||
rasterizer/jitter/streamout_jit.cpp \
|
rasterizer/jitter/streamout_jit.cpp \
|
||||||
rasterizer/jitter/streamout_jit.h \
|
rasterizer/jitter/streamout_jit.h \
|
||||||
rasterizer/jitter/shader_lib/DebugOutput.cpp \
|
rasterizer/jitter/shader_lib/DebugOutput.cpp \
|
||||||
|
rasterizer/jitter/shader_lib/Scatter.cpp \
|
||||||
rasterizer/jitter/functionpasses/passes.h \
|
rasterizer/jitter/functionpasses/passes.h \
|
||||||
rasterizer/jitter/functionpasses/lower_x86.cpp
|
rasterizer/jitter/functionpasses/lower_x86.cpp
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,7 @@ files_swr_mesa = files(
|
||||||
'rasterizer/jitter/streamout_jit.cpp',
|
'rasterizer/jitter/streamout_jit.cpp',
|
||||||
'rasterizer/jitter/streamout_jit.h',
|
'rasterizer/jitter/streamout_jit.h',
|
||||||
'rasterizer/jitter/shader_lib/DebugOutput.cpp',
|
'rasterizer/jitter/shader_lib/DebugOutput.cpp',
|
||||||
|
'rasterizer/jitter/shader_lib/Scatter.cpp',
|
||||||
'rasterizer/jitter/functionpasses/lower_x86.cpp',
|
'rasterizer/jitter/functionpasses/lower_x86.cpp',
|
||||||
'rasterizer/memory/SurfaceState.h'
|
'rasterizer/memory/SurfaceState.h'
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,7 @@ intrinsics = [
|
||||||
['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
|
['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
|
||||||
['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
|
['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
|
||||||
['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
|
['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'],
|
||||||
|
['VSCATTERPS', ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'],
|
||||||
['VRCPPS', ['a'], 'a'],
|
['VRCPPS', ['a'], 'a'],
|
||||||
['VROUND', ['a', 'rounding'], 'a'],
|
['VROUND', ['a', 'rounding'], 'a'],
|
||||||
['BEXTR_32', ['src', 'control'], 'src'],
|
['BEXTR_32', ['src', 'control'], 'src'],
|
||||||
|
|
|
||||||
|
|
@ -237,7 +237,8 @@ namespace SwrJit
|
||||||
return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
|
return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
|
||||||
}
|
}
|
||||||
|
|
||||||
StoreInst* BuilderGfxMem::STORE(Value *Val, Value *Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
|
StoreInst*
|
||||||
|
BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
|
||||||
{
|
{
|
||||||
AssertGFXMemoryParams(Ptr, usage);
|
AssertGFXMemoryParams(Ptr, usage);
|
||||||
|
|
||||||
|
|
@ -245,7 +246,11 @@ namespace SwrJit
|
||||||
return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
|
return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
|
||||||
}
|
}
|
||||||
|
|
||||||
StoreInst* BuilderGfxMem::STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset, Type* Ty, JIT_MEM_CLIENT usage)
|
StoreInst* BuilderGfxMem::STORE(Value* Val,
|
||||||
|
Value* BasePtr,
|
||||||
|
const std::initializer_list<uint32_t>& offset,
|
||||||
|
Type* Ty,
|
||||||
|
JIT_MEM_CLIENT usage)
|
||||||
{
|
{
|
||||||
AssertGFXMemoryParams(BasePtr, usage);
|
AssertGFXMemoryParams(BasePtr, usage);
|
||||||
|
|
||||||
|
|
@ -253,7 +258,8 @@ namespace SwrJit
|
||||||
return Builder::STORE(Val, BasePtr, offset, Ty, usage);
|
return Builder::STORE(Val, BasePtr, offset, Ty, usage);
|
||||||
}
|
}
|
||||||
|
|
||||||
CallInst* BuilderGfxMem::MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask, Type* Ty, JIT_MEM_CLIENT usage)
|
CallInst* BuilderGfxMem::MASKED_STORE(
|
||||||
|
Value* Val, Value* Ptr, unsigned Align, Value* Mask, Type* Ty, JIT_MEM_CLIENT usage)
|
||||||
{
|
{
|
||||||
AssertGFXMemoryParams(Ptr, usage);
|
AssertGFXMemoryParams(Ptr, usage);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -647,6 +647,10 @@ namespace SwrJit
|
||||||
{
|
{
|
||||||
AssertMemoryUsageParams(pDst, usage);
|
AssertMemoryUsageParams(pDst, usage);
|
||||||
|
|
||||||
|
SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
|
||||||
|
VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
|
||||||
|
return;
|
||||||
|
|
||||||
/* Scatter algorithm
|
/* Scatter algorithm
|
||||||
|
|
||||||
while(Index = BitScanForward(mask))
|
while(Index = BitScanForward(mask))
|
||||||
|
|
@ -657,6 +661,10 @@ namespace SwrJit
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
// Previous JIT-loop implementation, commented out and kept for reference
|
||||||
|
|
||||||
BasicBlock* pCurBB = IRB()->GetInsertBlock();
|
BasicBlock* pCurBB = IRB()->GetInsertBlock();
|
||||||
Function* pFunc = pCurBB->getParent();
|
Function* pFunc = pCurBB->getParent();
|
||||||
Type* pSrcTy = vSrc->getType()->getVectorElementType();
|
Type* pSrcTy = vSrc->getType()->getVectorElementType();
|
||||||
|
|
@ -744,5 +752,7 @@ namespace SwrJit
|
||||||
|
|
||||||
// Move builder to beginning of post loop
|
// Move builder to beginning of post loop
|
||||||
IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
|
IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
|
||||||
|
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
} // namespace SwrJit
|
} // namespace SwrJit
|
||||||
|
|
|
||||||
|
|
@ -32,8 +32,12 @@
|
||||||
#include "passes.h"
|
#include "passes.h"
|
||||||
#include "JitManager.h"
|
#include "JitManager.h"
|
||||||
|
|
||||||
|
#include "common/simdlib.hpp"
|
||||||
|
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
|
extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
|
||||||
|
|
||||||
namespace llvm
|
namespace llvm
|
||||||
{
|
{
|
||||||
// forward declare the initializer
|
// forward declare the initializer
|
||||||
|
|
@ -88,6 +92,8 @@ namespace SwrJit
|
||||||
Instruction*
|
Instruction*
|
||||||
VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
||||||
Instruction*
|
Instruction*
|
||||||
|
VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
||||||
|
Instruction*
|
||||||
VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
||||||
Instruction*
|
Instruction*
|
||||||
VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
|
||||||
|
|
@ -102,88 +108,61 @@ namespace SwrJit
|
||||||
|
|
||||||
static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
|
static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
|
static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
|
||||||
// 256 wide 512 wide
|
// 256 wide 512 wide
|
||||||
{
|
{
|
||||||
// AVX
|
// AVX
|
||||||
{"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
|
{"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{"meta.intrinsic.VPERMPS",
|
{"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
{"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
||||||
{"meta.intrinsic.VPERMD",
|
{"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
{"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{"meta.intrinsic.VGATHERPD",
|
{"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
|
||||||
{"meta.intrinsic.VGATHERPS",
|
{"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
|
||||||
{"meta.intrinsic.VGATHERDD",
|
{"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{"meta.intrinsic.VCVTPD2PS",
|
|
||||||
{{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
|
|
||||||
{"meta.intrinsic.VCVTPH2PS",
|
|
||||||
{{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
|
|
||||||
{"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
|
|
||||||
{"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
// AVX2
|
// AVX2
|
||||||
{"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
|
{"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{"meta.intrinsic.VPERMPS",
|
{"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
||||||
{{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
{"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
||||||
{"meta.intrinsic.VPERMD",
|
{"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
{"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{"meta.intrinsic.VGATHERPD",
|
{"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
|
||||||
{"meta.intrinsic.VGATHERPS",
|
{"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
|
||||||
{"meta.intrinsic.VGATHERDD",
|
{"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
|
||||||
{"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
|
|
||||||
{"meta.intrinsic.VCVTPH2PS",
|
|
||||||
{{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
|
|
||||||
{"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
|
|
||||||
{"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
// AVX512
|
// AVX512
|
||||||
{"meta.intrinsic.VRCPPS",
|
{"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
|
||||||
{{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
|
|
||||||
#if LLVM_VERSION_MAJOR < 7
|
#if LLVM_VERSION_MAJOR < 7
|
||||||
{"meta.intrinsic.VPERMPS",
|
{"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
|
||||||
{{Intrinsic::x86_avx512_mask_permvar_sf_256,
|
{"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
|
||||||
Intrinsic::x86_avx512_mask_permvar_sf_512},
|
|
||||||
NO_EMU}},
|
|
||||||
{"meta.intrinsic.VPERMD",
|
|
||||||
{{Intrinsic::x86_avx512_mask_permvar_si_256,
|
|
||||||
Intrinsic::x86_avx512_mask_permvar_si_512},
|
|
||||||
NO_EMU}},
|
|
||||||
#else
|
#else
|
||||||
{"meta.intrinsic.VPERMPS",
|
{"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
{"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
||||||
{"meta.intrinsic.VPERMD",
|
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
|
|
||||||
#endif
|
#endif
|
||||||
{"meta.intrinsic.VGATHERPD",
|
{"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{"meta.intrinsic.VGATHERPS",
|
{"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
{"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
|
||||||
{"meta.intrinsic.VGATHERDD",
|
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
|
|
||||||
#if LLVM_VERSION_MAJOR < 7
|
#if LLVM_VERSION_MAJOR < 7
|
||||||
{"meta.intrinsic.VCVTPD2PS",
|
{"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
|
||||||
{{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
|
|
||||||
NO_EMU}},
|
|
||||||
#else
|
#else
|
||||||
{"meta.intrinsic.VCVTPD2PS",
|
{"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
|
|
||||||
#endif
|
#endif
|
||||||
{"meta.intrinsic.VCVTPH2PS",
|
{"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, NO_EMU}},
|
||||||
{{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
|
{"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
|
||||||
NO_EMU}},
|
{"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
|
||||||
{"meta.intrinsic.VROUND",
|
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
|
|
||||||
{"meta.intrinsic.VHSUBPS",
|
|
||||||
{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
|
|
||||||
}};
|
}};
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
struct LowerX86 : public FunctionPass
|
struct LowerX86 : public FunctionPass
|
||||||
{
|
{
|
||||||
|
|
@ -209,6 +188,27 @@ namespace SwrJit
|
||||||
SWR_ASSERT(false, "Unsupported AVX architecture.");
|
SWR_ASSERT(false, "Unsupported AVX architecture.");
|
||||||
mTarget = AVX;
|
mTarget = AVX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Setup scatter function for 256 wide
|
||||||
|
uint32_t curWidth = B->mVWidth;
|
||||||
|
B->SetTargetWidth(8);
|
||||||
|
std::vector<Type*> args = {
|
||||||
|
B->mInt8PtrTy, // pBase
|
||||||
|
B->mSimdInt32Ty, // vIndices
|
||||||
|
B->mSimdFP32Ty, // vSrc
|
||||||
|
B->mInt8Ty, // mask
|
||||||
|
B->mInt32Ty // scale
|
||||||
|
};
|
||||||
|
|
||||||
|
FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
|
||||||
|
mPfnScatter256 = cast<Function>(
|
||||||
|
B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
|
||||||
|
if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
|
||||||
|
{
|
||||||
|
sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
|
||||||
|
}
|
||||||
|
|
||||||
|
B->SetTargetWidth(curWidth);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to decipher the vector type of the instruction. This does not work properly
|
// Try to decipher the vector type of the instruction. This does not work properly
|
||||||
|
|
@ -392,23 +392,39 @@ namespace SwrJit
|
||||||
virtual bool runOnFunction(Function& F)
|
virtual bool runOnFunction(Function& F)
|
||||||
{
|
{
|
||||||
std::vector<Instruction*> toRemove;
|
std::vector<Instruction*> toRemove;
|
||||||
|
std::vector<BasicBlock*> bbs;
|
||||||
|
|
||||||
for (auto& BB : F.getBasicBlockList())
|
// Make temp copy of the basic blocks and instructions, as the intrinsic
|
||||||
|
// replacement code might invalidate the iterators
|
||||||
|
for (auto& b : F.getBasicBlockList())
|
||||||
{
|
{
|
||||||
for (auto& I : BB.getInstList())
|
bbs.push_back(&b);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto* BB : bbs)
|
||||||
|
{
|
||||||
|
std::vector<Instruction*> insts;
|
||||||
|
for (auto& i : BB->getInstList())
|
||||||
{
|
{
|
||||||
if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
|
insts.push_back(&i);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto* I : insts)
|
||||||
|
{
|
||||||
|
if (CallInst* pCallInst = dyn_cast<CallInst>(I))
|
||||||
{
|
{
|
||||||
Function* pFunc = pCallInst->getCalledFunction();
|
Function* pFunc = pCallInst->getCalledFunction();
|
||||||
if (pFunc)
|
if (pFunc)
|
||||||
{
|
{
|
||||||
if (pFunc->getName().startswith("meta.intrinsic"))
|
if (pFunc->getName().startswith("meta.intrinsic"))
|
||||||
{
|
{
|
||||||
B->IRB()->SetInsertPoint(&I);
|
B->IRB()->SetInsertPoint(I);
|
||||||
Instruction* pReplace = ProcessIntrinsic(pCallInst);
|
Instruction* pReplace = ProcessIntrinsic(pCallInst);
|
||||||
SWR_ASSERT(pReplace);
|
|
||||||
toRemove.push_back(pCallInst);
|
toRemove.push_back(pCallInst);
|
||||||
pCallInst->replaceAllUsesWith(pReplace);
|
if (pReplace)
|
||||||
|
{
|
||||||
|
pCallInst->replaceAllUsesWith(pReplace);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -428,10 +444,9 @@ namespace SwrJit
|
||||||
virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
|
virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
|
||||||
|
|
||||||
JitManager* JM() { return B->JM(); }
|
JitManager* JM() { return B->JM(); }
|
||||||
|
Builder* B;
|
||||||
Builder* B;
|
TargetArch mTarget;
|
||||||
|
Function* mPfnScatter256;
|
||||||
TargetArch mTarget;
|
|
||||||
|
|
||||||
static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
|
static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
|
||||||
};
|
};
|
||||||
|
|
@ -639,6 +654,69 @@ namespace SwrJit
|
||||||
|
|
||||||
return cast<Instruction>(v32Gather);
|
return cast<Instruction>(v32Gather);
|
||||||
}
|
}
|
||||||
|
//////////////////////////////////////////////////////////////////////////
/// @brief Lowers a VSCATTERPS meta intrinsic call.
///
/// On AVX512 the call is replaced by a native x86 scatter intrinsic. On any
/// other architecture it is replaced by one (256 wide) or two (512 wide)
/// calls to the function held in pThis->mPfnScatter256.
///
/// @param pThis     Lowering pass; supplies the builder and the scatter helper.
/// @param arch      Target architecture being lowered for.
/// @param width     Vector width of the call being lowered.
/// @param pCallInst The meta intrinsic call. Its operands are read in the
///                  order: base pointer, mask, indices, source, scale.
/// @return Always nullptr; no replacement value is produced for the call.
Instruction*
VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    Builder* pBuilder = pThis->B;

    Value* pDstBase  = pCallInst->getArgOperand(0);
    Value* vLaneMask = pCallInst->getArgOperand(1);
    Value* vOffsets  = pCallInst->getArgOperand(2);
    Value* vData     = pCallInst->getArgOperand(3);
    Value* scale     = pCallInst->getArgOperand(4);

    if (arch == AVX512)
    {
        if (width == W256)
        {
            // LLVM exposes no intrinsic that scatters 8 elements with 32-bit indices, so
            // use the 8-element scatter that takes 64-bit indices instead.
            Function* pScatterQps = Intrinsic::getDeclaration(
                pBuilder->JM()->mpCurrentModule, Intrinsic::x86_avx512_scatter_qps_512);

            Value* vOffsets64 = pBuilder->Z_EXT(vOffsets, pBuilder->mSimdInt64Ty);
            Value* maskBits   = pBuilder->BITCAST(vLaneMask, pBuilder->mInt8Ty);
            pBuilder->CALL(pScatterQps, {pDstBase, maskBits, vOffsets64, vData, scale});
        }
        else if (width == W512)
        {
            Function* pScatterDps = Intrinsic::getDeclaration(
                pBuilder->JM()->mpCurrentModule, Intrinsic::x86_avx512_scatter_dps_512);

            Value* maskBits = pBuilder->BITCAST(vLaneMask, pBuilder->mInt16Ty);
            pBuilder->CALL(pScatterDps, {pDstBase, maskBits, vOffsets, vData, scale});
        }

        return nullptr;
    }

    // Not AVX512: call out to a C function that performs the scatter. This compiles
    // significantly faster than jitting a scatter loop for every scatter.
    if (width == W256)
    {
        Value* maskBits = pBuilder->BITCAST(vLaneMask, pBuilder->mInt8Ty);
        pBuilder->CALL(pThis->mPfnScatter256, {pDstBase, vOffsets, vData, maskBits, scale});
    }
    else
    {
        // A 512 wide scatter is split into two 256 wide scatters: lanes 0-7, then lanes 8-15.
        auto lowerLanes = pBuilder->C({0, 1, 2, 3, 4, 5, 6, 7});
        auto upperLanes = pBuilder->C({8, 9, 10, 11, 12, 13, 14, 15});

        Value* vMaskLow    = pBuilder->VSHUFFLE(vLaneMask, vLaneMask, lowerLanes);
        Value* vOffsetsLow = pBuilder->VSHUFFLE(vOffsets, vOffsets, lowerLanes);
        Value* vDataLow    = pBuilder->VSHUFFLE(vData, vData, lowerLanes);

        Value* maskBitsLow = pBuilder->BITCAST(vMaskLow, pBuilder->mInt8Ty);
        pBuilder->CALL(pThis->mPfnScatter256,
                       {pDstBase, vOffsetsLow, vDataLow, maskBitsLow, scale});

        Value* vMaskHigh    = pBuilder->VSHUFFLE(vLaneMask, vLaneMask, upperLanes);
        Value* vOffsetsHigh = pBuilder->VSHUFFLE(vOffsets, vOffsets, upperLanes);
        Value* vDataHigh    = pBuilder->VSHUFFLE(vData, vData, upperLanes);

        Value* maskBitsHigh = pBuilder->BITCAST(vMaskHigh, pBuilder->mInt8Ty);
        pBuilder->CALL(pThis->mPfnScatter256,
                       {pDstBase, vOffsetsHigh, vDataHigh, maskBitsHigh, scale});
    }

    return nullptr;
}
|
||||||
|
|
||||||
// No support for vroundps in avx512 (it is available in kncni), so emulate with avx
|
// No support for vroundps in avx512 (it is available in kncni), so emulate with avx
|
||||||
// instructions
|
// instructions
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,49 @@
|
||||||
|
/****************************************************************************
|
||||||
|
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
* copy of this software and associated documentation files (the "Software"),
|
||||||
|
* to deal in the Software without restriction, including without limitation
|
||||||
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||||
|
* and/or sell copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice (including the next
|
||||||
|
* paragraph) shall be included in all copies or substantial portions of the
|
||||||
|
* Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||||
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||||
|
* IN THE SOFTWARE.
|
||||||
|
*
|
||||||
|
* @file Scatter.cpp
|
||||||
|
*
|
||||||
|
* @brief Shader support library implementation for scatter emulation
|
||||||
|
*
|
||||||
|
* Notes:
|
||||||
|
*
|
||||||
|
******************************************************************************/
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include "common/os.h"
|
||||||
|
#include "common/simdlib.hpp"
|
||||||
|
|
||||||
|
extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256::Float vSrc, uint8_t mask, uint32_t scale)
|
||||||
|
{
|
||||||
|
OSALIGN(float, 32) src[8];
|
||||||
|
OSALIGN(uint32_t, 32) indices[8];
|
||||||
|
|
||||||
|
SIMD256::store_ps(src, vSrc);
|
||||||
|
SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
|
||||||
|
|
||||||
|
DWORD index;
|
||||||
|
while (_BitScanForward(&index, mask))
|
||||||
|
{
|
||||||
|
mask &= ~(1 << index);
|
||||||
|
|
||||||
|
*(float*)(pBase + indices[index] * scale) = src[index];
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue