swr/rast: Fix 64bit float loads in x86 lowering pass

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
George Kyriazis 2018-04-06 16:39:09 -05:00
parent 1ffbbbee97
commit 96ad8f5a23
2 changed files with 25 additions and 45 deletions
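
Context for the diff below: the old GATHERPD emitted an AVX2 VGATHERPD when available and otherwise open-coded a scalar fallback, and that fallback hard-coded 4-wide vectors (VectorType::get(mDoubleTy, 4), VECTOR_SPLAT(4, ...)), which cannot be correct for SIMD16 fetch, where each half of the gather covers 8 doubles. After this commit GATHERPD always emits the VGATHERPD intrinsic and the x86 lowering pass is responsible for legalizing it per target and per width. As a reference for the semantics any such expansion must preserve, here is a minimal standalone C++ sketch of a masked 64-bit gather; the function name and the explicit width parameter are illustrative, not SWR code.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Scalar reference for a masked 64-bit gather: each active lane loads a
    // double from base + index * scale (byte addressing); inactive lanes keep
    // the pass-through value from src.  In the JIT, width is mVWidth / 2,
    // because a full-width float fetch is handled as two double-wide halves.
    static void gather_pd_ref(double*             dst,
                              const double*       src,      // pass-through values
                              const std::uint8_t* base,     // byte base pointer
                              const std::int32_t* indices,  // per-lane byte indices
                              const bool*         mask,     // per-lane enable
                              std::uint8_t        scale,
                              std::size_t         width)
    {
        for (std::size_t i = 0; i < width; ++i)
        {
            if (mask[i])
            {
                std::memcpy(&dst[i],
                            base + static_cast<std::int64_t>(indices[i]) * scale,
                            sizeof(double));
            }
            else
            {
                dst[i] = src[i];  // masked-off lane: keep source value
            }
        }
    }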


@@ -201,44 +201,7 @@ namespace SwrJit
     /// @param scale - value to scale indices by
     Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
     {
-        Value* vGather;
-
-        // use avx2 gather instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth / 2)), VectorType::get(mDoubleTy, mVWidth / 2));
-            vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
-        }
-        else
-        {
-            Value* pStack = STACKSAVE();
-            // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
-            Value* vSrcPtr = ALLOCA(vSrc->getType());
-            SetTempAlloca(vSrcPtr);
-            STORE(vSrc, vSrcPtr);
-
-            vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-            Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
-            Value *vOffsets = MUL(vIndices, vScaleVec);
-            for (uint32_t i = 0; i < mVWidth / 2; ++i)
-            {
-                // single component byte index
-                Value *offset = VEXTRACT(vOffsets, C(i));
-                // byte pointer to component
-                Value *loadAddress = GEP(pBase, offset);
-                loadAddress = BITCAST(loadAddress, PointerType::get(mDoubleTy, 0));
-                // pointer to the value to load if we're masking off a component
-                Value *maskLoadAddress = GEP(vSrcPtr, { C(0), C(i) });
-                Value *selMask = VEXTRACT(vMask, C(i));
-                // switch in a safe address to load if we're trying to access a vertex
-                Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
-                Value *val = LOAD(validAddress);
-                vGather = VINSERT(vGather, val, C(i));
-            }
-
-            STACKRESTORE(pStack);
-        }
-        return vGather;
+        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
     }

     //////////////////////////////////////////////////////////////////////////
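
A detail of the removed fallback that remains instructive for whoever writes the expansion in the lowering pass: it avoided per-lane branching by spilling vSrc to a stack slot and using SELECT to choose between the real load address and that slot, so every lane issues exactly one load and never dereferences an invalid address. A hedged plain-C++ rendering of that address-select trick (the helper name and the fixed 16-lane scratch buffer are hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Every lane performs one unconditional load; masked-off lanes are simply
    // redirected to a safe on-stack copy of src (the ALLOCA'd vSrcPtr in the
    // removed IR), so no branch is needed and no invalid address is read.
    static void gather_pd_select(double* dst, const double* src,
                                 const std::uint8_t* base,
                                 const std::int32_t* offsets,  // pre-scaled byte offsets
                                 const bool* mask, std::size_t width)
    {
        double safe[16] = {};  // scratch copy; assumes width <= 16
        for (std::size_t i = 0; i < width; ++i)
            safe[i] = src[i];

        for (std::size_t i = 0; i < width; ++i)
        {
            // Mirrors SELECT(selMask, loadAddress, maskLoadAddress).
            const std::uint8_t* addr =
                mask[i] ? base + offsets[i]
                        : reinterpret_cast<const std::uint8_t*>(&safe[i]);
            std::memcpy(&dst[i], addr, sizeof(double));
        }
    }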


@@ -230,7 +230,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     }

     // Fetch attributes from memory and output to a simdvertex struct
-    // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
     JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

     RET_VOID();
@@ -763,13 +762,31 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                 // if we need to gather the component
                 if (compCtrl[i] == StoreSrc)
                 {
-                    Value *vMaskLo = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3}));
-                    Value *vMaskHi = VSHUFFLE(vGatherMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7}));
+                    Value* vShufLo;
+                    Value* vShufHi;
+                    Value* vShufAll;

-                    Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0));
-                    Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1));
+                    if (mVWidth == 8)
+                    {
+                        vShufLo = C({ 0, 1, 2, 3 });
+                        vShufHi = C({ 4, 5, 6, 7 });
+                        vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+                    }
+                    else
+                    {
+                        SWR_ASSERT(mVWidth == 16);
+                        vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+                        vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
+                        vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
+                    }

-                    Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+                    Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
+                    Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
+
+                    Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
+                    Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
+
+                    Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                     Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
                     Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
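
The rewrite above replaces the hard-coded 4-lane shuffle constants and VEXTRACTI128 (which can only split a 256-bit vector in half) with shuffle masks derived from mVWidth, so the lo/hi split now works for both 8-wide and 16-wide fetch. A small illustrative C++ sketch of the index pattern (the template helper is hypothetical, not SWR code):

    #include <array>
    #include <cstddef>

    // The lo half takes lanes [0, W/2), the hi half takes lanes [W/2, W), so
    // each GATHERPD call sees exactly W/2 offsets and W/2 mask bits.
    template <std::size_t W>
    static std::array<int, W / 2> half_lanes(const std::array<int, W>& v, bool hi)
    {
        std::array<int, W / 2> out{};
        const std::size_t start = hi ? W / 2 : 0;  // vShufHi starts at W/2
        for (std::size_t i = 0; i < W / 2; ++i)
            out[i] = v[start + i];                 // same pattern as vShufLo/vShufHi
        return out;
    }

At W == 8 this reproduces the old {0, 1, 2, 3} / {4, 5, 6, 7} split; at W == 16 it yields {0..7} / {8..15}, which the VEXTRACTI128 path could not express.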
@@ -777,7 +794,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                     pGatherLo = VCVTPD2PS(pGatherLo);
                     pGatherHi = VCVTPD2PS(pGatherHi);

-                    Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7}));
+                    Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);

                     vVertexElements[currentVertexElement++] = pGather;
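
With both halves gathered as doubles, each is converted to floats (VCVTPD2PS) and the two mVWidth/2-wide results are concatenated back into one mVWidth-wide vector. Using vShufAll instead of the literal C({0, 1, 2, 3, 4, 5, 6, 7}) makes this final shuffle width-correct at SIMD16 as well. A minimal C++ sketch of the convert-and-join step (illustrative only):

    #include <array>
    #include <cstddef>

    // Convert each gathered double half to float (VCVTPD2PS), then concatenate
    // lo and hi into one W-wide vector (VSHUFFLE with vShufAll).
    template <std::size_t W>
    static std::array<float, W> cvt_and_join(const std::array<double, W / 2>& lo,
                                             const std::array<double, W / 2>& hi)
    {
        std::array<float, W> out{};
        for (std::size_t i = 0; i < W / 2; ++i)
        {
            out[i]         = static_cast<float>(lo[i]);
            out[W / 2 + i] = static_cast<float>(hi[i]);
        }
        return out;
    }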