Add SRC and IN implementations to avoid CompositeGeneral in some cases hit by PDF rendering

The patch implements a few more operations with special-cased MMX
code. On my laptop, applying the patch to cairo speeds up the
benchmark (rendering page 14 of a PDF file[*]) from 20.9 seconds
to 14.9 seconds, which is an improvement of 28.6%.

[*] http://people.redhat.com/jakub/prelink.pdf

This also benefits the recently added unaligned_clip perf case:

image-rgb  unaligned_clip-100 0.11 -> 0.06: 1.65x speedup
▋
image-rgba unaligned_clip-100 0.11 -> 0.06: 1.64x speedup
▋
This commit is contained in:
Soeren Sandmann 2007-01-09 15:05:29 -08:00 committed by Carl Worth
parent d5531c4f50
commit cf1d95e714
3 changed files with 366 additions and 1 deletions

View file

@ -2134,6 +2134,232 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_operator_t op,
_mm_empty ();
}
/*
 * IN operator with a solid source, an a8 mask and an a8 destination (MMX).
 *
 * Every destination byte becomes  sa * mask * dst,  where sa is the top
 * byte (alpha) of the solid colour fetched from pSrc and each product is
 * the 8-bit normalised multiply performed by in() / FbInU.
 *
 * op, xSrc and ySrc are accepted for signature compatibility with the
 * other composite functions but are not read here.
 */
void
fbCompositeIn_nx8x8mmx (pixman_operator_t op,
                        PicturePtr pSrc,
                        PicturePtr pMask,
                        PicturePtr pDst,
                        INT16 xSrc,
                        INT16 ySrc,
                        INT16 xMask,
                        INT16 yMask,
                        INT16 xDst,
                        INT16 yDst,
                        CARD16 width,
                        CARD16 height)
{
    CARD8 *dstLine, *dst;
    CARD8 *maskLine, *mask;
    FbStride dstStride, maskStride;
    CARD16 w;
    CARD32 src;
    CARD8 sa;
    __m64 vsrc, vsrca;

    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
    fbComposeGetSolid(pSrc, pDst, src);

    /*
     * Deliberately no early return when sa == 0: for IN a fully
     * transparent source must drive the destination to zero, so the
     * loops below still have to run.
     */
    sa = src >> 24;

    vsrc = load8888(src);
    vsrca = expand_alpha(vsrc);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /*
         * The fast path reads and writes four a8 pixels through CARD32
         * pointers, so it is the row data pointers -- not the picture
         * handles -- that have to be 4-byte aligned.
         */
        if ((((unsigned long)dst | (unsigned long)mask) & 3) == 0)
        {
            while (w >= 4)
            {
                __m64 vmask = load8888 (*(CARD32 *)mask);
                __m64 vdest = load8888 (*(CARD32 *)dst);

                *(CARD32 *)dst = store8888 (in (in (vsrca, vmask), vdest));

                dst += 4;
                mask += 4;
                w -= 4;
            }
        }

        /* Remaining (or unaligned) pixels, one byte at a time. */
        while (w--)
        {
            CARD16 tmp;
            CARD8 a;
            CARD32 m, d;
            CARD32 r;

            a = *mask++;
            d = *dst;
            m = FbInU (sa, 0, a, tmp);
            r = FbInU (m, 0, d, tmp);
            *dst++ = r;
        }
    }

    _mm_empty();
}
/*
 * IN operator with an a8 source and an a8 destination, no mask (MMX).
 *
 * Every destination byte becomes  src * dst  using the 8-bit normalised
 * multiply performed by in() / FbInU.
 *
 * op, pMask, xMask and yMask are accepted for signature compatibility
 * with the other composite functions but are not read here.
 */
void
fbCompositeIn_8x8mmx (pixman_operator_t op,
                      PicturePtr pSrc,
                      PicturePtr pMask,
                      PicturePtr pDst,
                      INT16 xSrc,
                      INT16 ySrc,
                      INT16 xMask,
                      INT16 yMask,
                      INT16 xDst,
                      INT16 yDst,
                      CARD16 width,
                      CARD16 height)
{
    CARD8 *dstLine, *dst;
    CARD8 *srcLine, *src;
    FbStride srcStride, dstStride;
    CARD16 w;

    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /*
         * The fast path dereferences src and dst as CARD32, so test the
         * alignment of those row pointers rather than of the picture
         * handles.
         */
        if ((((unsigned long)dst | (unsigned long)src) & 3) == 0)
        {
            while (w >= 4)
            {
                CARD32 *s = (CARD32 *)src;
                CARD32 *d = (CARD32 *)dst;

                /* four a8 pixels per iteration */
                *d = store8888 (in (load8888 (*s), load8888 (*d)));

                w -= 4;
                dst += 4;
                src += 4;
            }
        }

        /* Remaining (or unaligned) pixels, one byte at a time. */
        while (w--)
        {
            CARD8 s, d;
            CARD16 tmp;

            s = *src;
            d = *dst;
            *dst = FbInU (s, 0, d, tmp);
            src++;
            dst++;
        }
    }

    _mm_empty ();
}
/*
 * ADD operator with a solid source, an a8 mask and an a8 destination (MMX).
 *
 * Every destination byte becomes  saturate (sa * mask + dst),  where sa is
 * the top byte (alpha) of the solid colour fetched from pSrc.
 *
 * op, xSrc and ySrc are accepted for signature compatibility with the
 * other composite functions but are not read here.
 */
void
fbCompositeSrcAdd_8888x8x8mmx (pixman_operator_t op,
                               PicturePtr pSrc,
                               PicturePtr pMask,
                               PicturePtr pDst,
                               INT16 xSrc,
                               INT16 ySrc,
                               INT16 xMask,
                               INT16 yMask,
                               INT16 xDst,
                               INT16 yDst,
                               CARD16 width,
                               CARD16 height)
{
    CARD8 *dstLine, *dst;
    CARD8 *maskLine, *mask;
    FbStride dstStride, maskStride;
    CARD16 w;
    CARD32 src;
    CARD8 sa;
    __m64 vsrc, vsrca;

    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
    fbComposeGetSolid(pSrc, pDst, src);

    /* Adding sa * mask == 0 leaves every destination byte unchanged. */
    sa = src >> 24;
    if (sa == 0)
        return;

    vsrc = load8888(src);
    vsrca = expand_alpha(vsrc);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        /*
         * The fast path dereferences mask and dst as CARD32, so test the
         * alignment of those row pointers rather than of the picture
         * handles.
         */
        if ((((unsigned long)mask | (unsigned long)dst) & 3) == 0)
        {
            while (w >= 4)
            {
                __m64 vmask = load8888 (*(CARD32 *)mask);
                __m64 vdest = load8888 (*(CARD32 *)dst);

                /* four a8 pixels per iteration, saturating add */
                *(CARD32 *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));

                w -= 4;
                dst += 4;
                mask += 4;
            }
        }

        /* Remaining (or unaligned) pixels, one byte at a time. */
        while (w--)
        {
            CARD16 tmp;
            CARD16 a;
            CARD32 m, d;
            CARD32 r;

            a = *mask++;
            d = *dst;
            m = FbInU (sa, 0, a, tmp);
            r = FbAdd (m, d, 0, tmp);
            *dst++ = r;
        }
    }

    _mm_empty();
}
void
fbCompositeSrcAdd_8000x8000mmx (pixman_operator_t op,
PicturePtr pSrc,

View file

@ -44,6 +44,20 @@ Bool fbHaveMMX(void);
pixman_private
void fbComposeSetupMMX(void);
/*
 * MMX composite routine: IN operator with a solid source (a single colour
 * fetched from pSrc), an a8 mask and an a8 destination.  Operates on the
 * width x height rectangle at (xMask, yMask) in the mask and (xDst, yDst)
 * in the destination.
 */
pixman_private
void fbCompositeIn_nx8x8mmx (pixman_operator_t op,
PicturePtr pSrc,
PicturePtr pMask,
PicturePtr pDst,
INT16 xSrc,
INT16 ySrc,
INT16 xMask,
INT16 yMask,
INT16 xDst,
INT16 yDst,
CARD16 width,
CARD16 height);
pixman_private
void fbCompositeSolidMask_nx8888x0565Cmmx (pixman_operator_t op,
PicturePtr pSrc,
@ -109,6 +123,35 @@ void fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_operator_t op,
INT16 yDst,
CARD16 width,
CARD16 height);
/*
 * MMX composite routine: ADD operator with a solid source (a single colour
 * fetched from pSrc), an a8 mask and an a8 destination.  Operates on the
 * width x height rectangle at (xMask, yMask) in the mask and (xDst, yDst)
 * in the destination.
 */
pixman_private
void fbCompositeSrcAdd_8888x8x8mmx (pixman_operator_t op,
PicturePtr pSrc,
PicturePtr pMask,
PicturePtr pDst,
INT16 xSrc,
INT16 ySrc,
INT16 xMask,
INT16 yMask,
INT16 xDst,
INT16 yDst,
CARD16 width,
CARD16 height);
/*
 * MMX composite routine: IN operator with an a8 source and an a8
 * destination; the mask arguments are not used by the implementation.
 * Operates on the width x height rectangle at (xSrc, ySrc) in the source
 * and (xDst, yDst) in the destination.
 */
pixman_private
void fbCompositeIn_8x8mmx (pixman_operator_t op,
PicturePtr pSrc,
PicturePtr pMask,
PicturePtr pDst,
INT16 xSrc,
INT16 ySrc,
INT16 xMask,
INT16 yMask,
INT16 xDst,
INT16 yDst,
CARD16 width,
CARD16 height);
pixman_private
void fbCompositeSrcAdd_8000x8000mmx (pixman_operator_t op,
PicturePtr pSrc,

View file

@ -843,6 +843,58 @@ fbCompositeSrcAdd_8888x8888 (pixman_operator_t op,
}
}
/*
 * Plain C path for ADD with a solid source, an a8 mask and an a8
 * destination.
 *
 * The alpha byte of the solid colour is scaled by each mask byte and the
 * result is added, with saturation, to the matching destination byte.
 * op, xSrc and ySrc are not read.
 */
static void
fbCompositeSrcAdd_8888x8x8 (pixman_operator_t op,
                            PicturePtr pSrc,
                            PicturePtr pMask,
                            PicturePtr pDst,
                            INT16 xSrc,
                            INT16 ySrc,
                            INT16 xMask,
                            INT16 yMask,
                            INT16 xDst,
                            INT16 yDst,
                            CARD16 width,
                            CARD16 height)
{
    CARD8 *dstRow;
    CARD8 *maskRow;
    FbStride dstPitch, maskPitch;
    CARD32 solid;
    CARD8 srcAlpha;
    CARD16 rowsLeft;
    CARD16 x;

    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstPitch, dstRow, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskPitch, maskRow, 1);
    fbComposeGetSolid (pSrc, pDst, solid);

    srcAlpha = (solid >> 24);

    for (rowsLeft = height; rowsLeft != 0; rowsLeft--)
    {
        CARD8 *out = dstRow;
        CARD8 *coverageRow = maskRow;

        for (x = 0; x < width; x++)
        {
            CARD16 scratch;
            CARD16 coverage = coverageRow[x];
            CARD32 current = out[x];
            /* solid alpha attenuated by the mask byte */
            CARD32 scaled = FbInU (srcAlpha, 0, coverage, scratch);

            /* saturating add onto the existing destination byte */
            out[x] = FbAdd (scaled, current, 0, scratch);
        }

        dstRow += dstPitch;
        maskRow += maskPitch;
    }
}
static void
fbCompositeSrcAdd_1000x1000 (pixman_operator_t op,
PicturePtr pSrc,
@ -1759,6 +1811,26 @@ pixman_composite (pixman_operator_t op,
break;
}
}
else
{
if ((pSrc->format_code == PICT_a8r8g8b8 ||
pSrc->format_code == PICT_a8b8g8r8) &&
srcRepeat &&
pMask->format_code == PICT_a8 &&
pDst->format_code == PICT_a8)
{
#ifdef USE_MMX
if (fbHaveMMX())
{
srcRepeat = FALSE;
func = fbCompositeSrcAdd_8888x8x8mmx;
}
else
#endif
func = fbCompositeSrcAdd_8888x8x8;
}
}
break;
case PIXMAN_OPERATOR_SRC:
if (pMask)
@ -1798,10 +1870,34 @@ pixman_composite (pixman_operator_t op,
}
}
break;
case PIXMAN_OPERATOR_IN:
#ifdef USE_MMX
if (pSrc->format_code == PICT_a8 &&
pDst->format_code == PICT_a8 &&
!pMask)
{
if (fbHaveMMX())
func = fbCompositeIn_8x8mmx;
}
else if (srcRepeat && pMask && !pMask->componentAlpha &&
(pSrc->format_code == PICT_a8r8g8b8 ||
pSrc->format_code == PICT_a8b8g8r8) &&
(pMask->format_code == PICT_a8) &&
pDst->format_code == PICT_a8)
{
if (fbHaveMMX())
{
srcRepeat = FALSE;
func = fbCompositeIn_nx8x8mmx;
}
}
#else
func = NULL;
#endif
break;
case PIXMAN_OPERATOR_CLEAR:
case PIXMAN_OPERATOR_DST:
case PIXMAN_OPERATOR_OVER_REVERSE:
case PIXMAN_OPERATOR_IN:
case PIXMAN_OPERATOR_IN_REVERSE:
case PIXMAN_OPERATOR_OUT:
case PIXMAN_OPERATOR_OUT_REVERSE: