mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-31 14:10:09 +01:00
swr/rast: fix USE_SIMD16_FRONTEND issues
Fix problems found when enabling USE_SIMD16_FRONTEND, mostly related to vMask / movemask_ps(pd). Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
07062daae9
commit
d08493f9ce
14 changed files with 49 additions and 74 deletions
|
|
@ -159,20 +159,10 @@ typedef SIMD512 SIMD16;
|
|||
#define _simd16_packus_epi32 SIMD16::packus_epi32
|
||||
#define _simd16_packs_epi32 SIMD16::packs_epi32
|
||||
#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
|
||||
#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
|
||||
#define _simd16_int2mask(mask) simd16mask(mask)
|
||||
#define _simd16_mask2int(mask) int(mask)
|
||||
|
||||
// convert bitmask to vector mask
|
||||
SIMDINLINE simd16scalar vMask16(int32_t mask)
|
||||
{
|
||||
simd16scalari temp = _simd16_set1_epi32(mask);
|
||||
|
||||
simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
|
||||
|
||||
simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
|
||||
|
||||
return _simd16_castsi_ps(result);
|
||||
}
|
||||
#define _simd16_vmask_ps SIMD16::vmask_ps
|
||||
|
||||
#endif//ENABLE_AVX512_SIMD16
|
||||
|
||||
|
|
|
|||
|
|
@ -181,6 +181,7 @@ typedef SIMD256 SIMD;
|
|||
#define _simd_storeu2_si SIMD::storeu2_si
|
||||
|
||||
#define _simd_blendv_epi32 SIMD::blendv_epi32
|
||||
#define _simd_vmask_ps SIMD::vmask_ps
|
||||
|
||||
template<int mask> SIMDINLINE
|
||||
SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
|
||||
|
|
@ -188,26 +189,6 @@ SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
|
|||
return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
|
||||
}
|
||||
|
||||
// convert bitmask to vector mask
|
||||
SIMDINLINE
|
||||
SIMD256::Float vMask(int32_t mask)
|
||||
{
|
||||
SIMD256::Integer vec = SIMD256::set1_epi32(mask);
|
||||
const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
||||
vec = SIMD256::and_si(vec, bit);
|
||||
vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
|
||||
return SIMD256::castsi_ps(vec);
|
||||
}
|
||||
|
||||
SIMDINLINE
|
||||
SIMD256::Integer vMaski(int32_t mask)
|
||||
{
|
||||
SIMD256::Integer vec = SIMD256::set1_epi32(mask);
|
||||
const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
||||
vec = SIMD256::and_si(vec, bit);
|
||||
return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
|
||||
}
|
||||
|
||||
SIMDINLINE
|
||||
void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -519,6 +519,11 @@ static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float i
|
|||
return _mm_set_ps(in3, in2, in1, in0);
|
||||
}
|
||||
|
||||
static SIMDINLINE Integer SIMDCALL set_epi32(int in3, int in2, int in1, int in0)
|
||||
{
|
||||
return _mm_set_epi32(in3, in2, in1, in0);
|
||||
}
|
||||
|
||||
template <int ImmT>
|
||||
static SIMDINLINE float SIMDCALL extract_ps(Float a)
|
||||
{
|
||||
|
|
@ -526,6 +531,16 @@ static SIMDINLINE float SIMDCALL extract_ps(Float a)
|
|||
return *reinterpret_cast<float*>(&tmp);
|
||||
}
|
||||
|
||||
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
|
||||
{
|
||||
Integer vec = set1_epi32(mask);
|
||||
const Integer bit = set_epi32(
|
||||
0x08, 0x04, 0x02, 0x01);
|
||||
vec = and_si(vec, bit);
|
||||
vec = cmplt_epi32(setzero_si(), vec);
|
||||
return castsi_ps(vec);
|
||||
}
|
||||
|
||||
#undef SIMD_WRAPPER_1
|
||||
#undef SIMD_WRAPPER_2
|
||||
#undef SIMD_DWRAPPER_2
|
||||
|
|
|
|||
|
|
@ -741,6 +741,16 @@ static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Imp
|
|||
_mm256_storeu2_m128i(&phi->v, &plo->v, src);
|
||||
}
|
||||
|
||||
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
|
||||
{
|
||||
Integer vec = set1_epi32(mask);
|
||||
const Integer bit = set_epi32(
|
||||
0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
||||
vec = and_si(vec, bit);
|
||||
vec = cmplt_epi32(setzero_si(), vec);
|
||||
return castsi_ps(vec);
|
||||
}
|
||||
|
||||
#undef SIMD_WRAPPER_1
|
||||
#undef SIMD_WRAPPER_2
|
||||
#undef SIMD_DWRAPPER_2
|
||||
|
|
|
|||
|
|
@ -554,12 +554,12 @@ static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
|
|||
|
||||
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
|
||||
{
|
||||
__mmask8 m = _mm512_cmplt_pd_mask(a, setzero_pd());
|
||||
__mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi32(-1));
|
||||
return static_cast<uint32_t>(m);
|
||||
}
|
||||
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
|
||||
{
|
||||
__mmask16 m = _mm512_cmplt_ps_mask(a, setzero_ps());
|
||||
__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(-1));
|
||||
return static_cast<uint32_t>(m);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -29,9 +29,6 @@
|
|||
//
|
||||
//============================================================================
|
||||
|
||||
static const int TARGET_SIMD_WIDTH = 16;
|
||||
using SIMD256T = SIMD256Impl::AVX2Impl;
|
||||
|
||||
#define SIMD_WRAPPER_1_(op, intrin) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a) \
|
||||
{\
|
||||
|
|
@ -135,24 +132,6 @@ using SIMD256T = SIMD256Impl::AVX2Impl;
|
|||
}
|
||||
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
private:
|
||||
static SIMDINLINE Integer vmask(__mmask8 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi64(m, -1LL);
|
||||
}
|
||||
static SIMDINLINE Integer vmask(__mmask16 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi32(m, -1);
|
||||
}
|
||||
static SIMDINLINE Integer vmask(__mmask32 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi16(m, -1);
|
||||
}
|
||||
static SIMDINLINE Integer vmask(__mmask64 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi8(m, -1);
|
||||
}
|
||||
|
||||
public:
|
||||
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
|
||||
|
|
|
|||
|
|
@ -821,13 +821,11 @@ static SIMDINLINE Float SIMDCALL set_ps(
|
|||
|
||||
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
|
||||
{
|
||||
Integer vec = set1_epi32(mask);
|
||||
const Integer bit = set_epi32(
|
||||
0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
|
||||
0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
||||
vec = and_si(vec, bit);
|
||||
vec = cmplt_epi32(setzero_si(), vec);
|
||||
return castsi_ps(vec);
|
||||
return Float
|
||||
{
|
||||
SIMD256T::vmask_ps(mask),
|
||||
SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)
|
||||
};
|
||||
}
|
||||
|
||||
#undef SIMD_WRAPPER_1
|
||||
|
|
|
|||
|
|
@ -277,7 +277,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
|
|||
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
|
||||
}
|
||||
|
||||
simdscalar vCoverageMask = vMask(coverageMask);
|
||||
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
|
||||
simdscalar stencilPassMask = vCoverageMask;
|
||||
|
||||
AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
|
||||
|
|
|
|||
|
|
@ -576,7 +576,7 @@ struct PixelRateZTestLoop
|
|||
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
|
||||
{
|
||||
const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
|
||||
vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
|
||||
vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
|
||||
|
||||
if(!_simd_movemask_ps(vCoverageMask[sample]))
|
||||
{
|
||||
|
|
@ -597,7 +597,7 @@ struct PixelRateZTestLoop
|
|||
const float minz = state.depthBoundsState.depthBoundsTestMinValue;
|
||||
const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
|
||||
|
||||
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
|
||||
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
|
||||
}
|
||||
|
||||
AR_BEGIN(BEBarycentric, pDC->drawId);
|
||||
|
|
@ -630,7 +630,7 @@ struct PixelRateZTestLoop
|
|||
{
|
||||
uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
|
||||
|
||||
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
|
||||
vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
|
||||
}
|
||||
|
||||
// ZTest for this sample
|
||||
|
|
@ -907,7 +907,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
|
|||
#endif
|
||||
simdscalar activeLanes;
|
||||
if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
|
||||
activeLanes = vMask(work.anyCoveredSamples & MASK);
|
||||
activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
|
||||
|
||||
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
|
|||
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
|
||||
}
|
||||
|
||||
simdscalar vCoverageMask = vMask(coverageMask);
|
||||
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
|
||||
simdscalar depthPassMask = vCoverageMask;
|
||||
simdscalar stencilPassMask = vCoverageMask;
|
||||
|
||||
|
|
|
|||
|
|
@ -117,7 +117,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
|
|||
coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
|
||||
}
|
||||
|
||||
simdscalar vCoverageMask = vMask(coverageMask);
|
||||
simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
|
||||
simdscalar depthPassMask = vCoverageMask;
|
||||
simdscalar stencilPassMask = vCoverageMask;
|
||||
|
||||
|
|
|
|||
|
|
@ -1013,7 +1013,7 @@ public:
|
|||
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
|
||||
// we have to clip tris, execute the clipper, which will also
|
||||
// call the binner
|
||||
ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
|
||||
ClipSimd(_simd_vmask_ps(primMask), _simd_vmask_ps(clipMask), pa, primId);
|
||||
AR_END(FEGuardbandClip, 1);
|
||||
}
|
||||
else if (validMask)
|
||||
|
|
@ -1081,7 +1081,7 @@ public:
|
|||
|
||||
// cull prims outside view frustum
|
||||
simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16();
|
||||
int validMask = primMask & _simd16_movemask_ps(_simd16_cmpeq_ps(clipIntersection, _simd16_setzero_ps()));
|
||||
int validMask = primMask & _simd16_cmpeq_ps_mask(clipIntersection, _simd16_setzero_ps());
|
||||
|
||||
// skip clipping for points
|
||||
uint32_t clipMask = 0;
|
||||
|
|
@ -1095,7 +1095,7 @@ public:
|
|||
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
|
||||
// we have to clip tris, execute the clipper, which will also
|
||||
// call the binner
|
||||
ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
|
||||
ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
|
||||
AR_END(FEGuardbandClip, 1);
|
||||
}
|
||||
else if (validMask)
|
||||
|
|
|
|||
|
|
@ -481,7 +481,7 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
|
|||
{
|
||||
uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
|
||||
uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
|
||||
return _simd_castps_si(vMask(mask));
|
||||
return _simd_castps_si(_simd_vmask_ps(mask));
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
|||
|
|
@ -703,7 +703,9 @@ struct PA_STATE_CUT : public PA_STATE
|
|||
#if USE_SIMD16_FRONTEND
|
||||
simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
|
||||
|
||||
verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
|
||||
// Assigning to a temporary first to avoid an MSVC 2017 compiler bug
|
||||
simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
|
||||
verts[v].v[c] = t;
|
||||
#else
|
||||
verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue