summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSøren Sandmann Pedersen <sandmann@daimi.au.dk>2004-07-22 19:24:50 +0000
committerSøren Sandmann Pedersen <sandmann@daimi.au.dk>2004-07-22 19:24:50 +0000
commitcc3e0173d9fae8a40eb46606d9951e3aa1df975a (patch)
treebd64512cf0d26994d87e366ce184a83297a9f5e6
parent9565d9e0cf85e6f5fb47acebdd66212bd6cc3e08 (diff)
Thu Jul 22 20:03:11 2004 Soeren Sandmann <sandmann@daimi.au.dk>
Call MMX solid fill routine when available. Call MMX operations when available. New HasGcc34 macro. New file with many operations implemented with MMX intrinsics, conditional on having GCC 3.4 on i386.
-rw-r--r--fb/fbfill.c6
-rw-r--r--fb/fbmmx.c1514
-rw-r--r--fb/fbmmx.h160
-rw-r--r--fb/fbpict.c286
-rw-r--r--fb/fbpict.h64
-rw-r--r--fb/fbsolid.c1
6 files changed, 1923 insertions, 108 deletions
diff --git a/fb/fbfill.c b/fb/fbfill.c
index d03bc6461..f5842c252 100644
--- a/fb/fbfill.c
+++ b/fb/fbfill.c
@@ -24,6 +24,7 @@
/* $XFree86: xc/programs/Xserver/fb/fbfill.c,v 1.5 2003/01/29 00:43:33 torrey Exp $ */
#include "fb.h"
+#include "fbmmx.h"
void
fbFill (DrawablePtr pDrawable,
@@ -43,6 +44,11 @@ fbFill (DrawablePtr pDrawable,
switch (pGC->fillStyle) {
case FillSolid:
+#ifdef USE_GCC34_MMX
+ if (!pPriv->and && fbHaveMMX())
+ if (fbSolidFillmmx (pDrawable, x, y, width, height, pPriv->xor))
+ return;
+#endif
fbSolid (dst + (y + dstYoff) * dstStride,
dstStride,
(x + dstXoff) * dstBpp,
diff --git a/fb/fbmmx.c b/fb/fbmmx.c
new file mode 100644
index 000000000..9adda92de
--- /dev/null
+++ b/fb/fbmmx.c
@@ -0,0 +1,1514 @@
+/*
+ * Copyright © 2004 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Red Hat makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Søren Sandmann (sandmann@redhat.com)
+ *
+ * Based on work by Owen Taylor
+ */
+
+#include "fb.h"
+
+#ifdef USE_GCC34_MMX
+
+#ifdef RENDER
+
+#include "picturestr.h"
+#include "mipict.h"
+#include "fbpict.h"
+
+/* 64-bit operand types for the MMX builtins, declared through GCC's "mode"
+ * attribute: one 64-bit integer (DI), or a 64-bit vector of 2 x 32-bit (V2SI),
+ * 4 x 16-bit (V4HI) or 8 x 8-bit (V8QI) elements.
+ */
+typedef int Vector1x64 __attribute__ ((mode(DI)));
+typedef int Vector2x32 __attribute__ ((mode(V2SI)));
+typedef int Vector4x16 __attribute__ ((mode(V4HI)));
+typedef int Vector8x8 __attribute__ ((mode(V8QI)));
+
+/* Shorthand for the 64-bit scalar used for constants and vector casts. */
+typedef unsigned long long ullong;
+
+/* Tracing switch: as written only "noVERBOSE" is defined, so VERBOSE is not
+ * and CHECKPOINT() expands to nothing.
+ */
+#define noVERBOSE
+
+#ifdef VERBOSE
+/* Log the current function name and line number through ErrorF. */
+#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__)
+#else
+#define CHECKPOINT()
+#endif
+
+/* Layout of the table of 64-bit constants that the helpers in this file
+ * pass to the MMX builtins as memory operands.
+ */
+typedef struct
+{
+ /* all-zero word, 0x00ff in each 16-bit word, 0x0080 in each 16-bit word */
+ ullong mmx_zero;
+ ullong mmx_4x00ff;
+ ullong mmx_4x0080;
+ /* mask and per-word multiplier used by expand565 */
+ ullong mmx_565_rgb;
+ ullong mmx_565_unpack_multiplier;
+ /* channel masks used by pack565 */
+ ullong mmx_565_r;
+ ullong mmx_565_g;
+ ullong mmx_565_b;
+ /* masks used by pack565 to clear the 16-bit word being replaced */
+ ullong mmx_mask_0;
+ ullong mmx_mask_1;
+ ullong mmx_mask_2;
+ ullong mmx_mask_3;
+ /* OR-ed into the expanded alpha by over_rev_non_pre */
+ ullong mmx_full_alpha;
+ /* word-selection masks used by the shift-based invert_colors */
+ ullong mmx_ffff0000ffff0000;
+ ullong mmx_0000ffff00000000;
+ ullong mmx_000000000000ffff;
+} MMXData;
+
+/* The constant table itself.  Designated initializers are used, so the order
+ * here need not match the declaration order in MMXData.
+ */
+static const MMXData c =
+{
+ .mmx_zero = 0x0000000000000000ULL,
+ .mmx_4x00ff = 0x00ff00ff00ff00ffULL,
+ .mmx_4x0080 = 0x0080008000800080ULL,
+ .mmx_565_rgb = 0x000001f0003f001fULL,
+ .mmx_565_r = 0x000000f800000000ULL,
+ .mmx_565_g = 0x0000000000fc0000ULL,
+ .mmx_565_b = 0x00000000000000f8ULL,
+ /* mmx_mask_N: all ones except 16-bit word N, which is zero */
+ .mmx_mask_0 = 0xffffffffffff0000ULL,
+ .mmx_mask_1 = 0xffffffff0000ffffULL,
+ .mmx_mask_2 = 0xffff0000ffffffffULL,
+ .mmx_mask_3 = 0x0000ffffffffffffULL,
+ .mmx_full_alpha = 0x00ff000000000000ULL,
+ .mmx_565_unpack_multiplier = 0x0000008404100840ULL,
+ .mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL,
+ .mmx_0000ffff00000000 = 0x0000ffff00000000ULL,
+ .mmx_000000000000ffff = 0x000000000000ffffULL,
+};
+
+/* Shift the whole 64-bit value: left by s when s is positive, right
+ * (logical) by -s when s is negative, and not at all when s is zero.
+ */
+static __inline__ Vector1x64
+shift (Vector1x64 v, int s)
+{
+    if (s == 0)
+        return v;
+
+    if (s < 0)
+        return __builtin_ia32_psrlq (v, -s);
+
+    return __builtin_ia32_psllq (v, s);
+}
+
+/* XOR each 16-bit word of mask with 0x00ff; for words in the range
+ * 0..0xff this yields 0xff - word.
+ */
+static __inline__ Vector4x16
+negate (Vector4x16 mask)
+{
+    Vector1x64 bits = (Vector1x64)mask;
+
+    bits = __builtin_ia32_pxor (bits, (Vector1x64)c.mmx_4x00ff);
+
+    return (Vector4x16)bits;
+}
+
+/* Multiply two pixels channel by channel, treating every 16-bit word as an
+ * 8-bit value in the range 0..0xff, and return the rounded product / 255.
+ *
+ * With t = a * b the result is (t + (t >> 8) + 0x80) >> 8.  Leaving out the
+ * (t >> 8) term is a divide by 256 rather than 255, so 0xff * 0xff would
+ * give 0xfe and a fully opaque source would come out slightly darker than
+ * the fast paths that store the source unchanged.
+ *
+ * Both inputs must have every word <= 0xff; then no intermediate sum can
+ * exceed 0xffff (the saturating adds are only a safety net).
+ */
+static __inline__ Vector4x16
+pix_multiply (Vector4x16 a, Vector4x16 b)
+{
+    Vector4x16 res;
+
+    res = __builtin_ia32_pmullw (a, b);
+    res = __builtin_ia32_paddusw (res, __builtin_ia32_psrlw (res, 8));
+    res = __builtin_ia32_paddusw (res, (Vector4x16)c.mmx_4x0080);
+    res = __builtin_ia32_psrlw (res, 8);
+
+    return res;
+}
+
+/* Compile-time switch: change "#if 0" to "#if 1" to select the pshufw-based
+ * helpers below instead of the shift/or based ones.
+ */
+#if 0
+#define HAVE_PSHUFW
+#endif
+
+#ifdef HAVE_PSHUFW
+
+/* Replicate 16-bit word 3 of pixel into all four words (pshufw $0xFF). */
+static __inline__ Vector4x16
+expand_alpha (Vector4x16 pixel)
+{
+    Vector4x16 alpha4;
+
+    __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (alpha4) : "y" (pixel));
+
+    return alpha4;
+}
+
+/* Replicate 16-bit word 0 of pixel into all four words (pshufw $0x00). */
+static __inline__ Vector4x16
+expand_alpha_rev (Vector4x16 pixel)
+{
+    Vector4x16 low4;
+
+    __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (low4) : "y" (pixel));
+
+    return low4;
+}
+
+/* Swap 16-bit words 0 and 2 of pixel, leaving words 1 and 3 in place.
+ *
+ * The pshufw selector 0xC6 is binary 11 00 01 10, i.e. the result words
+ * 3..0 are taken from source words 3, 0, 1, 2.
+ */
+static __inline__ Vector4x16
+invert_colors (Vector4x16 pixel)
+{
+    Vector4x16 swapped;
+
+    __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (swapped) : "y" (pixel));
+
+    return swapped;
+}
+
+#else
+
+/* Replicate 16-bit word 3 of pixel into all four words, using only
+ * 64-bit shifts and ORs.
+ */
+static __inline__ Vector4x16
+expand_alpha (Vector4x16 pixel)
+{
+    /* word 3 moved down to word 0, everything above cleared */
+    Vector1x64 alpha = shift ((Vector1x64)pixel, -48);
+    /* copy into word 1, then duplicate the low pair into the high pair */
+    Vector1x64 pair = __builtin_ia32_por (alpha, shift (alpha, 16));
+    Vector1x64 quad = __builtin_ia32_por (pair, shift (pair, 32));
+
+    return (Vector4x16)quad;
+}
+
+/* Replicate 16-bit word 0 of pixel into all four words, using only
+ * 64-bit shifts and ORs.
+ */
+static __inline__ Vector4x16
+expand_alpha_rev (Vector4x16 pixel)
+{
+    /* shifting up and back down by 48 clears words 1..3 */
+    Vector1x64 low = shift (shift ((Vector1x64)pixel, 48), -48);
+    /* copy into word 1, then duplicate the low pair into the high pair */
+    Vector1x64 pair = __builtin_ia32_por (low, shift (low, 16));
+    Vector1x64 quad = __builtin_ia32_por (pair, shift (pair, 32));
+
+    return (Vector4x16)quad;
+}
+
+/* Swap 16-bit words 0 and 2 of pixel, leaving words 1 and 3 in place,
+ * using masks, 64-bit shifts and ORs.
+ */
+static __inline__ Vector4x16
+invert_colors (Vector4x16 pixel)
+{
+    Vector1x64 p = (Vector1x64)pixel;
+    Vector1x64 kept, word0, word2;
+
+    /* words 3 and 1 stay where they are */
+    kept = __builtin_ia32_pand (p, (Vector1x64)c.mmx_ffff0000ffff0000);
+
+    /* word 0 moves up to word 2, word 2 moves down to word 0 */
+    word0 = shift (__builtin_ia32_pand (p, (Vector1x64)c.mmx_000000000000ffff), 32);
+    word2 = shift (__builtin_ia32_pand (p, (Vector1x64)c.mmx_0000ffff00000000), -32);
+
+    kept = __builtin_ia32_por (kept, word0);
+    kept = __builtin_ia32_por (kept, word2);
+
+    return (Vector4x16)kept;
+}
+
+#endif
+
+/* Notes about writing mmx code
+ *
+ * Give memory operands as the second operand. If you give it as the
+ * first, gcc will first load it into a register, then use that register.
+ *
+ * I.e. use
+ *
+ * __builtin_ia32_pmullw (x, mmx_constant[8]);
+ *
+ * not
+ *
+ * __builtin_ia32_pmullw (mmx_constant[8], x);
+ *
+ * Also try to minimize dependencies, i.e. when you need a value, try to calculate
+ * it from a value that was calculated as early as possible.
+ */
+
+/* Return src + dest * negate(srca), with the product taken by
+ * pix_multiply and the final add saturating per byte (paddusb).
+ */
+static __inline__ Vector4x16
+over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest)
+{
+    Vector4x16 inv_alpha = negate (srca);
+    Vector4x16 faded_dest = pix_multiply (dest, inv_alpha);
+
+    return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)faded_dest);
+}
+
+/* "Over" for a source whose words 0 and 2 are swapped relative to dest and
+ * whose colour words have not yet been multiplied by alpha.
+ *
+ * The source has words 0 and 2 swapped (invert_colors) and is multiplied by
+ * its own expanded alpha; OR-ing mmx_full_alpha into the multiplier sets
+ * word 3 to 0xff so the alpha word itself is kept.  The result is then
+ * composited over dest with over().
+ */
+static __inline__ Vector4x16
+over_rev_non_pre (Vector4x16 src, Vector4x16 dest)
+{
+    Vector4x16 alpha = expand_alpha (src);
+    Vector4x16 factor = (Vector4x16)__builtin_ia32_por ((Vector1x64)alpha, (Vector1x64)c.mmx_full_alpha);
+    Vector4x16 premultiplied = pix_multiply (invert_colors (src), factor);
+
+    return over (premultiplied, alpha, dest);
+}
+
+/* Return src multiplied per channel by mask; a thin wrapper around
+ * pix_multiply.
+ */
+static __inline__ Vector4x16
+in (Vector4x16 src,
+ Vector4x16 mask)
+{
+ return pix_multiply (src, mask);
+}
+
+/* Multiply both src and srca by mask, then composite the masked source
+ * over dest using the masked alpha.
+ */
+static __inline__ Vector4x16
+in_over (Vector4x16 src,
+         Vector4x16 srca,
+         Vector4x16 mask,
+         Vector4x16 dest)
+{
+    Vector4x16 masked_src = in (src, mask);
+    Vector4x16 masked_alpha = pix_multiply (srca, mask);
+
+    return over (masked_src, masked_alpha, dest);
+}
+
+/* Widen a 32-bit value to 64 bits (upper half zero) and reinterpret it as
+ * a vector of eight bytes.
+ */
+static __inline__ Vector8x8
+cvt32to64 (CARD32 v)
+{
+ ullong r = v;
+ return (Vector8x8)r;
+}
+
+/* Unpack the four bytes of a 32-bit pixel into four 16-bit words by
+ * interleaving them with zero bytes (punpcklbw).
+ */
+static __inline__ Vector4x16
+load8888 (CARD32 v)
+{
+    Vector8x8 packed = cvt32to64 (v);
+    Vector8x8 widened = __builtin_ia32_punpcklbw (packed, (Vector8x8)c.mmx_zero);
+
+    return (Vector4x16)widened;
+}
+
+/* Pack two unpacked pixels (four 16-bit words each) back into eight bytes
+ * with unsigned saturation (packuswb); lo supplies the low four bytes and
+ * hi the high four.
+ */
+static __inline__ Vector8x8
+pack8888 (Vector4x16 lo, Vector4x16 hi)
+{
+    return __builtin_ia32_packuswb (lo, hi);
+}
+
+/* Expand the 16-bit 565 pixel held in word @pos (0-3) of an mmx register
+ * into four 16-bit words 0000 00RR 00GG 00BB.
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);    (per 16-bit word, pmullw)
+ * m = m >> 8;                (per 16-bit word, psrlw)
+ *
+ * Note the trick here - the top word is shifted by another nibble (red ends
+ * up at bits 36-40 rather than 32-36) to avoid it bumping into the middle
+ * word; its multiplier, 0x0084, is correspondingly 16 times smaller than
+ * the 0x0840 used for blue.
+ */
+static __inline__ Vector4x16
+expand565 (Vector4x16 pixel, int pos)
+{
+ Vector1x64 p = (Vector1x64)pixel;
+
+ /* move pixel to low 16 bit and zero the rest */
+ p = shift (shift (p, (3 - pos) * 16), -48);
+
+ /* t1: copy placing the red field at bits 36-40; t2: copy placing the green field at bits 16-21 */
+ Vector1x64 t1 = shift (p, 36 - 11);
+ Vector1x64 t2 = shift (p, 16 - 5);
+
+ p = __builtin_ia32_por (t1, p);
+ p = __builtin_ia32_por (t2, p);
+ /* keep only the r, g and b fields, one per 16-bit word */
+ p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb);
+
+ /* scale each field up to 8 bits */
+ pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier);
+ return __builtin_ia32_psrlw (pixel, 8);
+}
+
+/* Unpack one of the two 32-bit pixels held in a 64-bit register into four
+ * 16-bit words: pos 0 selects the low pixel, any other pos the high pixel.
+ */
+static __inline__ Vector4x16
+expand8888 (Vector4x16 in, int pos)
+{
+    if (pos != 0)
+        return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
+
+    return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
+}
+
+/* Convert an unpacked pixel (0000 00RR 00GG 00BB) to 565 and insert it into
+ * 16-bit word pos of target, returning the updated target.
+ *
+ * The top 5/6/5 bits of each channel are masked out and shifted into their
+ * 565 positions inside word pos.  For pos 0..3 that word of target is
+ * cleared first; for any other pos target is not masked.
+ */
+static __inline__ Vector4x16
+pack565 (Vector4x16 pixel, Vector4x16 target, int pos)
+{
+    Vector1x64 unpacked = (Vector1x64)pixel;
+    Vector1x64 merged = (Vector1x64)target;
+    int slot = pos * 16;
+    Vector1x64 red, green, blue;
+
+    red = shift (__builtin_ia32_pand (unpacked, (Vector1x64)c.mmx_565_r), slot - (32 - 8));
+    green = shift (__builtin_ia32_pand (unpacked, (Vector1x64)c.mmx_565_g), slot - (16 - 3));
+    blue = shift (__builtin_ia32_pand (unpacked, (Vector1x64)c.mmx_565_b), slot - (0 + 3));
+
+    switch (pos)
+    {
+    case 0:
+        merged = __builtin_ia32_pand (merged, (Vector1x64)c.mmx_mask_0);
+        break;
+    case 1:
+        merged = __builtin_ia32_pand (merged, (Vector1x64)c.mmx_mask_1);
+        break;
+    case 2:
+        merged = __builtin_ia32_pand (merged, (Vector1x64)c.mmx_mask_2);
+        break;
+    case 3:
+        merged = __builtin_ia32_pand (merged, (Vector1x64)c.mmx_mask_3);
+        break;
+    default:
+        break;
+    }
+
+    merged = __builtin_ia32_por (red, merged);
+    merged = __builtin_ia32_por (green, merged);
+
+    return (Vector4x16)__builtin_ia32_por (blue, merged);
+}
+
+/* Issue the "emms" instruction, which empties the MMX state so the
+ * registers can be used for x87 floating point again.  Every routine in
+ * this file calls it once after its MMX work is done.
+ */
+static __inline__ void
+emms (void)
+{
+ __asm__ __volatile__ ("emms");
+}
+
+/* Composite a solid source over a 32-bit-per-pixel destination using MMX.
+ *
+ * The solid value is read with fbComposeGetSolid; if its top byte is zero
+ * the function returns without drawing.  Each row is processed as single
+ * pixels until dst is 8-byte aligned, then two pixels per 64-bit load and
+ * store, then any leftover pixel.  Every destination pixel becomes
+ * over (src, expand_alpha (src), dest).
+ *
+ * op, pMask, xSrc, ySrc, xMask and yMask are not referenced here.
+ *
+ * NOTE(review): fbComposeGetSolid and fbComposeGetStart are defined outside
+ * this file; presumably the latter yields the first-pixel pointer and the
+ * row stride in CARD32 units - confirm.
+ */
+void
+fbCompositeSolid_nx8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src;
+ CARD32 *dstLine, *dst;
+ CARD16 w;
+ FbStride dstStride;
+ Vector4x16 vsrc, vsrca;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src);
+
+ /* top byte zero: nothing is drawn */
+ if (src >> 24 == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+
+ vsrc = load8888 (src);
+ vsrca = expand_alpha (vsrc);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ w = width;
+
+ CHECKPOINT();
+
+ /* head: one pixel at a time until dst is 8-byte aligned */
+ while (w && (unsigned long)dst & 7)
+ {
+ *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
+
+ w--;
+ dst++;
+ }
+
+ /* body: two pixels per 64-bit load/store */
+ while (w >= 2)
+ {
+ Vector4x16 vdest;
+ Vector4x16 dest0, dest1;
+
+ vdest = *(Vector4x16 *)dst;
+
+ dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
+ dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
+
+ *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
+
+ dst += 2;
+ w -= 2;
+ }
+
+ CHECKPOINT();
+
+ /* tail: at most one pixel left */
+ while (w)
+ {
+ *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
+
+ w--;
+ dst++;
+ }
+ }
+
+ emms();
+}
+
+/* Composite a solid source over a 16-bit (565) destination using MMX.
+ *
+ * The solid value is read with fbComposeGetSolid; if its top byte is zero
+ * the function returns without drawing.  Each row is processed as single
+ * pixels until dst is 8-byte aligned, then four pixels per 64-bit load and
+ * store, then the leftover pixels.  Each pixel is expanded with expand565,
+ * blended with over() and written back through pack565.
+ *
+ * op, pMask, xSrc, ySrc, xMask and yMask are not referenced here.
+ *
+ * NOTE(review): fbComposeGetSolid and fbComposeGetStart are defined outside
+ * this file; presumably the latter yields the first-pixel pointer and the
+ * row stride in CARD16 units - confirm.
+ */
+void
+fbCompositeSolid_nx0565mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src;
+ CARD16 *dstLine, *dst;
+ CARD16 w;
+ FbStride dstStride;
+ Vector4x16 vsrc, vsrca;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src);
+
+ /* top byte zero: nothing is drawn */
+ if (src >> 24 == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
+
+ vsrc = load8888 (src);
+ vsrca = expand_alpha (vsrc);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ w = width;
+
+ CHECKPOINT();
+
+ /* head: one pixel at a time until dst is 8-byte aligned */
+ while (w && (unsigned long)dst & 7)
+ {
+ ullong d = *dst;
+ Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
+ *dst = (ullong)vdest;
+
+ w--;
+ dst++;
+ }
+
+ /* body: four 565 pixels per 64-bit load/store, replaced in place one word at a time */
+ while (w >= 4)
+ {
+ Vector4x16 vdest;
+
+ vdest = *(Vector4x16 *)dst;
+
+ vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
+ vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
+ vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
+ vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
+
+ *(Vector8x8 *)dst = (Vector8x8)vdest;
+
+ dst += 4;
+ w -= 4;
+ }
+
+ CHECKPOINT();
+
+ /* tail: up to three pixels left */
+ while (w)
+ {
+ ullong d = *dst;
+ Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
+ *dst = (ullong)vdest;
+
+ w--;
+ dst++;
+ }
+ }
+
+ emms();
+}
+
+/* Composite a solid source through a 32-bit mask onto a 32-bit destination
+ * using MMX.
+ *
+ * The solid value is read with fbComposeGetSolid; if its top byte is zero
+ * the function returns without drawing.  For each pixel the whole 32-bit
+ * mask value is unpacked with load8888 and fed to in_over, so every channel
+ * has its own mask value.  Pixels whose mask is zero are left untouched.
+ * Each row is processed as single pixels until the destination is 8-byte
+ * aligned, then two pixels at a time, then any leftover pixel.
+ *
+ * op, xSrc and ySrc are not referenced here.
+ *
+ * NOTE(review): fbComposeGetSolid and fbComposeGetStart are defined outside
+ * this file; presumably the latter yields first-pixel pointers and strides
+ * in CARD32 units - confirm.
+ */
+void
+fbCompositeSolidMask_nx8888x8888Cmmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src, srca;
+ CARD32 *dstLine;
+ CARD32 *maskLine;
+ FbStride dstStride, maskStride;
+ Vector4x16 vsrc, vsrca;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src);
+
+ /* top byte zero: nothing is drawn */
+ srca = src >> 24;
+ if (srca == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
+
+ vsrc = load8888(src);
+ vsrca = expand_alpha(vsrc);
+
+ while (height--)
+ {
+ int twidth = width;
+ /* p walks the mask row, q walks the destination row */
+ CARD32 *p = (CARD32 *)maskLine;
+ CARD32 *q = (CARD32 *)dstLine;
+
+ /* head: one pixel at a time until q is 8-byte aligned */
+ while (twidth && (unsigned long)q & 7)
+ {
+ CARD32 m = *(CARD32 *)p;
+
+ if (m)
+ {
+ Vector4x16 vdest = load8888(*q);
+ vdest = in_over(vsrc, vsrca, load8888(m), vdest);
+ *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ }
+
+ twidth--;
+ p++;
+ q++;
+ }
+
+ /* body: two pixels per iteration; skipped when both masks are zero */
+ while (twidth >= 2)
+ {
+ CARD32 m0, m1;
+ m0 = *p;
+ m1 = *(p + 1);
+
+ if (m0 | m1)
+ {
+ Vector4x16 dest0, dest1;
+ Vector4x16 vdest = *(Vector4x16 *)q;
+
+ dest0 = in_over(vsrc, vsrca, load8888(m0),
+ expand8888 (vdest, 0));
+ dest1 = in_over(vsrc, vsrca, load8888(m1),
+ expand8888 (vdest, 1));
+
+ *(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1);
+ }
+
+ p += 2;
+ q += 2;
+ twidth -= 2;
+ }
+
+ /* tail: at most one pixel left */
+ while (twidth)
+ {
+ CARD32 m = *(CARD32 *)p;
+
+ if (m)
+ {
+ Vector4x16 vdest = load8888(*q);
+ vdest = in_over(vsrc, vsrca, load8888(m), vdest);
+ *q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ }
+
+ twidth--;
+ p++;
+ q++;
+ }
+
+ dstLine += dstStride;
+ maskLine += maskStride;
+ }
+
+ emms();
+}
+
+/* Composite a solid source through an 8-bit mask onto a 32-bit destination
+ * using MMX.
+ *
+ * The solid value is read with fbComposeGetSolid; if its top byte is zero
+ * the function returns without drawing.  Each mask byte is replicated into
+ * all four words with expand_alpha_rev and fed to in_over.  Pixels whose
+ * mask byte is zero are left untouched.  In the two-pixel loop, when the
+ * source's top byte is 0xff and both mask bytes are 0xff, the source value
+ * is stored directly without blending.
+ *
+ * op, xSrc and ySrc are not referenced here.
+ *
+ * NOTE(review): fbComposeGetSolid and fbComposeGetStart are defined outside
+ * this file; presumably the latter yields first-pixel pointers and strides
+ * in units of the given type - confirm.
+ */
+void
+fbCompositeSolidMask_nx8x8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src, srca;
+ CARD32 *dstLine, *dst;
+ CARD8 *maskLine, *mask;
+ FbStride dstStride, maskStride;
+ CARD16 w;
+ Vector4x16 vsrc, vsrca;
+ ullong srcsrc;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src);
+
+ /* top byte zero: nothing is drawn */
+ srca = src >> 24;
+ if (srca == 0)
+ return;
+
+ /* two copies of the source pixel, for the opaque fast path */
+ srcsrc = (unsigned long long)src << 32 | src;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
+
+ vsrc = load8888 (src);
+ vsrca = expand_alpha (vsrc);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ mask = maskLine;
+ maskLine += maskStride;
+ w = width;
+
+ CHECKPOINT();
+
+ /* head: one pixel at a time until dst is 8-byte aligned */
+ while (w && (unsigned long)dst & 7)
+ {
+ ullong m = *mask;
+
+ if (m)
+ {
+ Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst));
+ *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ }
+
+ w--;
+ mask++;
+ dst++;
+ }
+
+ CHECKPOINT();
+
+ /* body: two pixels per iteration */
+ while (w >= 2)
+ {
+ ullong m0, m1;
+ m0 = *mask;
+ m1 = *(mask + 1);
+
+ /* opaque source and full coverage on both pixels: plain store */
+ if (srca == 0xff && (m0 & m1) == 0xff)
+ {
+ *(unsigned long long *)dst = srcsrc;
+ }
+ else if (m0 | m1)
+ {
+ Vector4x16 vdest;
+ Vector4x16 dest0, dest1;
+
+ vdest = *(Vector4x16 *)dst;
+
+ dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0));
+ dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1));
+
+ *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
+ }
+
+ mask += 2;
+ dst += 2;
+ w -= 2;
+ }
+
+ CHECKPOINT();
+
+ /* tail: at most one pixel left */
+ while (w)
+ {
+ ullong m = *mask;
+
+ if (m)
+ {
+ Vector4x16 vdest = load8888(*dst);
+ vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest);
+ *dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+ }
+
+ w--;
+ mask++;
+ dst++;
+ }
+ }
+
+ emms();
+}
+
+
+/* Composite a solid source through an 8-bit mask onto a 16-bit (565)
+ * destination using MMX.
+ *
+ * The solid value is read with fbComposeGetSolid; if its top byte is zero
+ * the function returns without drawing.  Each mask byte is replicated into
+ * all four words with expand_alpha_rev and fed to in_over; destination
+ * pixels are expanded with expand565 and written back through pack565.
+ * Pixels whose mask byte is zero are left untouched.  In the four-pixel
+ * loop, when the source's top byte is 0xff and all four mask bytes are
+ * 0xff, four copies of the 565-packed source are stored directly.
+ *
+ * op, xSrc and ySrc are not referenced here.
+ *
+ * NOTE(review): fbComposeGetSolid and fbComposeGetStart are defined outside
+ * this file; presumably the latter yields first-pixel pointers and strides
+ * in units of the given type - confirm.
+ */
+void
+fbCompositeSolidMask_nx8x0565mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src, srca;
+ CARD16 *dstLine, *dst;
+ CARD8 *maskLine, *mask;
+ FbStride dstStride, maskStride;
+ CARD16 w;
+ Vector4x16 vsrc, vsrca;
+ unsigned long long srcsrcsrcsrc, src16;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src);
+
+ /* top byte zero: nothing is drawn */
+ srca = src >> 24;
+ if (srca == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
+
+ vsrc = load8888 (src);
+ vsrca = expand_alpha (vsrc);
+
+ /* source packed to 565 in the low word, then replicated four times for the opaque fast path */
+ src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0);
+
+ srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
+ (ullong)src16 << 16 | (ullong)src16;
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ mask = maskLine;
+ maskLine += maskStride;
+ w = width;
+
+ CHECKPOINT();
+
+ /* head: one pixel at a time until dst is 8-byte aligned */
+ while (w && (unsigned long)dst & 7)
+ {
+ ullong m = *mask;
+
+ if (m)
+ {
+ ullong d = *dst;
+ Vector4x16 vd = (Vector4x16)d;
+ Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
+ *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
+ }
+
+ w--;
+ mask++;
+ dst++;
+ }
+
+ CHECKPOINT();
+
+ /* body: four pixels per iteration */
+ while (w >= 4)
+ {
+ ullong m0, m1, m2, m3;
+ m0 = *mask;
+ m1 = *(mask + 1);
+ m2 = *(mask + 2);
+ m3 = *(mask + 3);
+
+ /* opaque source and full coverage on all four pixels: plain store */
+ if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
+ {
+ *(unsigned long long *)dst = srcsrcsrcsrc;
+ }
+ else if (m0 | m1 | m2 | m3)
+ {
+ Vector4x16 vdest;
+ Vector4x16 vm0, vm1, vm2, vm3;
+
+ vdest = *(Vector4x16 *)dst;
+
+ vm0 = (Vector4x16)m0;
+ vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
+ vm1 = (Vector4x16)m1;
+ vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
+ vm2 = (Vector4x16)m2;
+ vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
+ vm3 = (Vector4x16)m3;
+ vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
+
+ *(Vector4x16 *)dst = vdest;
+ }
+
+ w -= 4;
+ mask += 4;
+ dst += 4;
+ }
+
+ CHECKPOINT();
+
+ /* tail: up to three pixels left */
+ while (w)
+ {
+ ullong m = *mask;
+
+ if (m)
+ {
+ ullong d = *dst;
+ Vector4x16 vd = (Vector4x16)d;
+ Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
+ *dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
+ }
+
+ w--;
+ mask++;
+ dst++;
+ }
+ }
+
+ emms();
+}
+
+/* Composite a 32-bit "8888RevNP" source (see over_rev_non_pre) onto a 16-bit
+ * (565) destination using MMX.
+ *
+ * Each source pixel is blended with over_rev_non_pre and written back
+ * through pack565.  In the four-pixel loop, when all four source top bytes
+ * are 0xff the pixels are only channel-swapped (invert_colors) and packed;
+ * when all four top bytes are zero the destination is left untouched.
+ *
+ * The function asserts that pSrc and pMask refer to the same drawable.
+ * op, xMask and yMask are not referenced here.
+ *
+ * NOTE(review): fbComposeGetStart is defined outside this file; presumably
+ * it yields first-pixel pointers and strides in units of the given type -
+ * confirm.
+ */
+void
+fbCompositeSrc_8888RevNPx0565mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD16 *dstLine, *dst;
+ CARD32 *srcLine, *src;
+ FbStride dstStride, srcStride;
+ CARD16 w;
+
+ CHECKPOINT();
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
+
+ assert (pSrc->pDrawable == pMask->pDrawable);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+
+ CHECKPOINT();
+
+ /* head: one pixel at a time until dst is 8-byte aligned */
+ while (w && (unsigned long)dst & 7)
+ {
+ Vector4x16 vsrc = load8888 (*src);
+ ullong d = *dst;
+ Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+
+ vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
+
+ *dst = (ullong)vdest;
+
+ w--;
+ dst++;
+ src++;
+ }
+
+ CHECKPOINT();
+
+ /* body: four pixels per iteration */
+ while (w >= 4)
+ {
+ CARD32 s0, s1, s2, s3;
+ unsigned char a0, a1, a2, a3;
+
+ s0 = *src;
+ s1 = *(src + 1);
+ s2 = *(src + 2);
+ s3 = *(src + 3);
+
+ a0 = (s0 >> 24);
+ a1 = (s1 >> 24);
+ a2 = (s2 >> 24);
+ a3 = (s3 >> 24);
+
+ /* all four opaque: swap channels and pack, no blending with dst */
+ if ((a0 & a1 & a2 & a3) == 0xFF)
+ {
+ Vector4x16 vdest;
+ vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0);
+ vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
+ vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
+ vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
+
+ *(Vector4x16 *)dst = vdest;
+ }
+ else if (a0 | a1 | a2 | a3)
+ {
+ Vector4x16 vdest = *(Vector4x16 *)dst;
+
+ vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
+ vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
+ vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
+ vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
+
+ *(Vector4x16 *)dst = vdest;
+ }
+
+ w -= 4;
+ dst += 4;
+ src += 4;
+ }
+
+ CHECKPOINT();
+
+ /* tail: up to three pixels left */
+ while (w)
+ {
+ Vector4x16 vsrc = load8888 (*src);
+ ullong d = *dst;
+ Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+
+ vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
+
+ *dst = (ullong)vdest;
+
+ w--;
+ dst++;
+ src++;
+ }
+ }
+
+ emms();
+}
+
+/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
+
+/* Composite a 32-bit "8888RevNP" source onto a 32-bit destination using MMX.
+ *
+ * Each source pixel is blended with over_rev_non_pre.  In the two-pixel
+ * loop, when both source top bytes are 0xff the pixels are only
+ * channel-swapped (invert_colors) and stored; when both top bytes are zero
+ * the destination is left untouched.
+ *
+ * The function asserts that pSrc and pMask refer to the same drawable.
+ * op, xMask and yMask are not referenced here.
+ *
+ * NOTE(review): fbComposeGetStart is defined outside this file; presumably
+ * it yields first-pixel pointers and strides in CARD32 units - confirm.
+ */
+void
+fbCompositeSrc_8888RevNPx8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 *dstLine, *dst;
+ CARD32 *srcLine, *src;
+ FbStride dstStride, srcStride;
+ CARD16 w;
+
+ CHECKPOINT();
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
+
+ assert (pSrc->pDrawable == pMask->pDrawable);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+
+ /* head: one pixel at a time until dst is 8-byte aligned */
+ while (w && (unsigned long)dst & 7)
+ {
+ Vector4x16 s = load8888 (*src);
+ Vector4x16 d = load8888 (*dst);
+
+ *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
+
+ w--;
+ dst++;
+ src++;
+ }
+
+ /* body: two pixels per iteration */
+ while (w >= 2)
+ {
+ ullong s0, s1;
+ unsigned char a0, a1;
+ Vector4x16 d0, d1;
+
+ s0 = *src;
+ s1 = *(src + 1);
+
+ a0 = (s0 >> 24);
+ a1 = (s1 >> 24);
+
+ /* both opaque: swap channels and store, no blending with dst */
+ if ((a0 & a1) == 0xFF)
+ {
+ d0 = invert_colors(load8888(s0));
+ d1 = invert_colors(load8888(s1));
+
+ *(Vector8x8 *)dst = pack8888 (d0, d1);
+ }
+ else if (a0 | a1)
+ {
+ Vector4x16 vdest = *(Vector4x16 *)dst;
+
+ d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
+ d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
+
+ *(Vector8x8 *)dst = pack8888 (d0, d1);
+ }
+
+ w -= 2;
+ dst += 2;
+ src += 2;
+ }
+
+ /* tail: at most one pixel left */
+ while (w)
+ {
+ Vector4x16 s = load8888 (*src);
+ Vector4x16 d = load8888 (*dst);
+
+ *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
+
+ w--;
+ dst++;
+ src++;
+ }
+ }
+
+ emms();
+}
+
+/* Composite a solid source through a 32-bit mask onto a 16-bit (565)
+ * destination using MMX.
+ *
+ * The solid value is read with fbComposeGetSolid; if its top byte is zero
+ * the function returns without drawing.  For each pixel the whole 32-bit
+ * mask value is unpacked with load8888 and fed to in_over, so every channel
+ * has its own mask value; destination pixels are expanded with expand565
+ * and written back through pack565.  Pixels whose mask is zero are left
+ * untouched.
+ *
+ * op, xSrc and ySrc are not referenced here.
+ *
+ * NOTE(review): fbComposeGetSolid and fbComposeGetStart are defined outside
+ * this file; presumably the latter yields first-pixel pointers and strides
+ * in units of the given type - confirm.
+ */
+void
+fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src, srca;
+ CARD16 *dstLine;
+ CARD32 *maskLine;
+ FbStride dstStride, maskStride;
+ Vector4x16 vsrc, vsrca;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src);
+
+ /* top byte zero: nothing is drawn */
+ srca = src >> 24;
+ if (srca == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
+
+ vsrc = load8888 (src);
+ vsrca = expand_alpha (vsrc);
+
+ while (height--)
+ {
+ int twidth = width;
+ /* p walks the mask row, q walks the destination row */
+ CARD32 *p = (CARD32 *)maskLine;
+ CARD16 *q = (CARD16 *)dstLine;
+
+ /* head: one pixel at a time until q is 8-byte aligned */
+ while (twidth && ((unsigned long)q & 7))
+ {
+ CARD32 m = *(CARD32 *)p;
+
+ if (m)
+ {
+ ullong d = *q;
+ Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+ vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
+ *q = (ullong)vdest;
+ }
+
+ twidth--;
+ p++;
+ q++;
+ }
+
+ /* body: four pixels per iteration; skipped when all four masks are zero */
+ while (twidth >= 4)
+ {
+ CARD32 m0, m1, m2, m3;
+
+ m0 = *p;
+ m1 = *(p + 1);
+ m2 = *(p + 2);
+ m3 = *(p + 3);
+
+ if ((m0 | m1 | m2 | m3))
+ {
+ Vector4x16 vdest = *(Vector4x16 *)q;
+
+ vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
+ vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
+ vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
+ vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
+
+ *(Vector4x16 *)q = vdest;
+ }
+ twidth -= 4;
+ p += 4;
+ q += 4;
+ }
+
+ /* tail: up to three pixels left */
+ while (twidth)
+ {
+ CARD32 m;
+
+ m = *(CARD32 *)p;
+ if (m)
+ {
+ ullong d = *q;
+ Vector4x16 vdest = expand565((Vector4x16)d, 0);
+ vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
+ *q = (ullong)vdest;
+ }
+
+ twidth--;
+ p++;
+ q++;
+ }
+
+ maskLine += maskStride;
+ dstLine += dstStride;
+ }
+
+ emms ();
+}
+
+/* Saturating add of an 8-bit-per-pixel source into an 8-bit-per-pixel
+ * destination using MMX.
+ *
+ * Each row is processed byte by byte until dst is 8-byte aligned, then
+ * eight bytes at a time with paddusb, then byte by byte for the remainder.
+ * The scalar loops compute the same saturating sum: t = d + s is formed in
+ * 16 bits, and OR-ing with 0 - (t >> 8) turns any sum above 0xff into 0xff.
+ *
+ * The asm statements store through dst without naming it as an output, so
+ * they carry a "memory" clobber to tell the compiler that memory changed.
+ *
+ * op, pMask, xMask and yMask are not referenced here.
+ *
+ * NOTE(review): fbComposeGetStart is defined outside this file; presumably
+ * it yields first-pixel pointers and strides in CARD8 units - confirm.
+ */
+void
+fbCompositeSrcAdd_8000x8000mmx (CARD8 op,
+                                PicturePtr pSrc,
+                                PicturePtr pMask,
+                                PicturePtr pDst,
+                                INT16 xSrc,
+                                INT16 ySrc,
+                                INT16 xMask,
+                                INT16 yMask,
+                                INT16 xDst,
+                                INT16 yDst,
+                                CARD16 width,
+                                CARD16 height)
+{
+    CARD8 *dstLine, *dst;
+    CARD8 *srcLine, *src;
+    FbStride dstStride, srcStride;
+    CARD16 w;
+    CARD8 s, d;
+    CARD16 t;
+
+    CHECKPOINT();
+
+    fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
+    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
+
+    while (height--)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+        src = srcLine;
+        srcLine += srcStride;
+        w = width;
+
+        /* head: one byte at a time until dst is 8-byte aligned */
+        while (w && (unsigned long)dst & 7)
+        {
+            s = *src;
+            d = *dst;
+            t = d + s;
+            s = t | (0 - (t >> 8));
+            *dst = s;
+
+            dst++;
+            src++;
+            w--;
+        }
+
+        /* body: eight bytes per iteration */
+        while (w >= 8)
+        {
+            __asm__ __volatile__ (
+                "movq (%0), %%mm2\n\t"
+                "movq (%1), %%mm3\n\t"
+                "paddusb %%mm2, %%mm3\n\t"
+                "movq %%mm3, (%1)\n\t"
+                : /* no output */
+                : "r" (src), "r" (dst)
+                : "memory");
+
+            dst += 8;
+            src += 8;
+            w -= 8;
+        }
+
+        /* tail: up to seven bytes left */
+        while (w)
+        {
+            s = *src;
+            d = *dst;
+            t = d + s;
+            s = t | (0 - (t >> 8));
+            *dst = s;
+
+            dst++;
+            src++;
+            w--;
+        }
+    }
+
+    emms();
+}
+
+/* Per-byte saturating add of a 32-bit-per-pixel source into a
+ * 32-bit-per-pixel destination using MMX.
+ *
+ * Each row is processed one pixel at a time (movd) until dst is 8-byte
+ * aligned, then two pixels at a time (movq), then at most one leftover
+ * pixel.  All additions use paddusb.
+ *
+ * Every asm statement writes to the destination, which is only named as an
+ * input operand (or reached through a pointer), so each one carries a
+ * "memory" clobber to tell the compiler that memory changed.
+ *
+ * op, pMask, xMask and yMask are not referenced here.
+ *
+ * NOTE(review): fbComposeGetStart is defined outside this file; presumably
+ * it yields first-pixel pointers and strides in CARD32 units - confirm.
+ */
+void
+fbCompositeSrcAdd_8888x8888mmx (CARD8 op,
+                                PicturePtr pSrc,
+                                PicturePtr pMask,
+                                PicturePtr pDst,
+                                INT16 xSrc,
+                                INT16 ySrc,
+                                INT16 xMask,
+                                INT16 yMask,
+                                INT16 xDst,
+                                INT16 yDst,
+                                CARD16 width,
+                                CARD16 height)
+{
+    CARD32 *dstLine, *dst;
+    CARD32 *srcLine, *src;
+    FbStride dstStride, srcStride;
+    CARD16 w;
+
+    CHECKPOINT();
+
+    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
+    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+
+    while (height--)
+    {
+        dst = dstLine;
+        dstLine += dstStride;
+        src = srcLine;
+        srcLine += srcStride;
+        w = width;
+
+        /* head: one pixel at a time until dst is 8-byte aligned */
+        while (w && (unsigned long)dst & 7)
+        {
+            __asm__ __volatile__ (
+                "movd %0, %%mm2\n\t"
+                "movd %1, %%mm3\n\t"
+                "paddusb %%mm2, %%mm3\n\t"
+                "movd %%mm3, %1\n\t"
+                : /* no output */
+                : "m" (*src), "m" (*dst)
+                : "memory");
+
+            dst++;
+            src++;
+            w--;
+        }
+
+        /* body: two pixels per iteration */
+        while (w >= 2)
+        {
+            __asm__ __volatile__ (
+                "movq (%0), %%mm2\n\t"
+                "movq (%1), %%mm3\n\t"
+                "paddusb %%mm2, %%mm3\n\t"
+                "movq %%mm3, (%1)\n\t"
+                : /* no output */
+                : "r" (src), "r" (dst)
+                : "memory");
+
+            dst += 2;
+            src += 2;
+            w -= 2;
+        }
+
+        /* tail: at most one pixel left */
+        if (w)
+        {
+            __asm__ __volatile__ (
+                "movd %0, %%mm2\n\t"
+                "movd %1, %%mm3\n\t"
+                "paddusb %%mm2, %%mm3\n\t"
+                "movd %%mm3, %1\n\t"
+                : /* no output */
+                : "m" (*src), "m" (*dst)
+                : "memory");
+        }
+    }
+
+    emms();
+}
+
+/* GetStart(drw, x, y, type, stride, line, bpp)
+ *
+ * Calls fbGetDrawable on drw, then sets stride to the drawable's stride
+ * converted from FbBits units to units of `type`, and line to the address
+ * of the element at (x, y) after subtracting the offsets fbGetDrawable
+ * reported.  bpp is passed to fbGetDrawable as an lvalue.
+ *
+ * The expansion is a bare { } block, not do { } while (0), so writing
+ * "GetStart (...);" before an "else" would not parse as intended.
+ */
+#define GetStart(drw,x,y,type,stride,line,bpp) {\
+ FbBits *__bits__; \
+ FbStride __stride__; \
+ int __xoff__,__yoff__; \
+ \
+ fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \
+ (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \
+ (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \
+}
+
+/* Fill a width x height rectangle at (x, y) of pDraw with the value xor,
+ * using 64-bit MMX stores for the bulk of each row.
+ *
+ * Returns FALSE without drawing when the drawable is neither 16 nor 32 bits
+ * per pixel, or when it is 16 bits per pixel and the two 16-bit halves of
+ * xor differ (the fill stores xor in 32- and 64-bit units, so both halves
+ * must hold the same pixel).  Returns TRUE after filling.
+ *
+ * Each row is written as 16-bit stores until the address is 4-byte aligned,
+ * 32-bit stores until it is 8-byte aligned, 64-byte groups of movq stores,
+ * then 32-bit stores and at most one final 16-bit store.
+ *
+ * NOTE(review): fbGetDrawable is defined outside this file; presumably it
+ * reports the stride in FbBits units, as the conversions below assume -
+ * confirm.
+ */
+Bool
+fbSolidFillmmx (DrawablePtr pDraw,
+                int x,
+                int y,
+                int width,
+                int height,
+                FbBits xor)
+{
+    FbStride stride;
+    int bpp;
+    ullong fill;
+    Vector8x8 vfill;
+    CARD32 byte_width;
+    CARD8 *byte_line;
+    FbBits *bits;
+    int xoff, yoff;
+
+    CHECKPOINT();
+
+    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
+
+    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
+        return FALSE;
+
+    if (bpp != 16 && bpp != 32)
+        return FALSE;
+
+    /* Locate the first byte of the rectangle and convert stride to bytes. */
+    if (bpp == 16)
+    {
+        stride = stride * sizeof (FbBits) / 2;
+        byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
+        byte_width = 2 * width;
+        stride *= 2;
+    }
+    else
+    {
+        stride = stride * sizeof (FbBits) / 4;
+        byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
+        byte_width = 4 * width;
+        stride *= 4;
+    }
+
+    /* xor repeated in both halves of a 64-bit register */
+    fill = ((ullong)xor << 32) | xor;
+    vfill = (Vector8x8)fill;
+
+    while (height--)
+    {
+        int w;
+        CARD8 *d = byte_line;
+        byte_line += stride;
+        w = byte_width;
+
+        /* 16-bit stores until d is 4-byte aligned */
+        while (w >= 2 && ((unsigned long)d & 3))
+        {
+            *(CARD16 *)d = xor;
+            w -= 2;
+            d += 2;
+        }
+
+        /* 32-bit stores until d is 8-byte aligned.  The pointer is cast to
+         * unsigned long, as in every other alignment test in this file, so
+         * that it is not truncated where pointers are wider than int.
+         */
+        while (w >= 4 && ((unsigned long)d & 7))
+        {
+            *(CARD32 *)d = xor;
+
+            w -= 4;
+            d += 4;
+        }
+
+        /* bulk: eight 64-bit stores (64 bytes) per iteration */
+        while (w >= 64)
+        {
+            __asm__ __volatile (
+                "movq %0, (%1)\n\t"
+                "movq %0, 8(%1)\n\t"
+                "movq %0, 16(%1)\n\t"
+                "movq %0, 24(%1)\n\t"
+                "movq %0, 32(%1)\n\t"
+                "movq %0, 40(%1)\n\t"
+                "movq %0, 48(%1)\n\t"
+                "movq %0, 56(%1)\n\t"
+                : /* no output */
+                : "y" (vfill), "r" (d)
+                : "memory");
+            w -= 64;
+            d += 64;
+        }
+
+        /* remainder: 32-bit stores, then at most one 16-bit store */
+        while (w >= 4)
+        {
+            *(CARD32 *)d = xor;
+
+            w -= 4;
+            d += 4;
+        }
+        if (w >= 2)
+        {
+            *(CARD16 *)d = xor;
+            w -= 2;
+            d += 2;
+        }
+    }
+
+    emms();
+    return TRUE;
+}
+
+/* Report whether the CPU supports MMX.
+ *
+ * The probe runs once; its result is cached in a static and returned on
+ * every later call.  It first checks that bit 21 of the flags register can
+ * be toggled (which means CPUID is available), then executes CPUID with
+ * %eax = 1 and tests bit 23 of %edx.
+ *
+ * The asm uses numeric local labels (1:, 2:) rather than named ones, so the
+ * assembler does not see duplicate symbols if the compiler ever emits this
+ * asm more than once.
+ *
+ * NOTE(review): tmp is a memory operand written while pusha has moved %esp.
+ * If the compiler addresses tmp relative to %esp (for example when built
+ * without a frame pointer) the store would land at the wrong address -
+ * confirm the build keeps a frame pointer for this function.
+ */
+Bool
+fbHaveMMX (void)
+{
+    static Bool initialized = FALSE;
+    static Bool mmx_present;
+
+    if (!initialized)
+    {
+        int tmp; /* static variables are accessed through %ebx,
+                  * but we mess around with the registers below,
+                  * so we need a temporary variable that can
+                  * be accessed directly.
+                  */
+
+        __asm__ __volatile__ (
+            /* Check if bit 21 in flags word is writeable */
+            "pusha \n\t"
+            "pushfl \n\t"
+            "popl %%eax \n\t"
+            "movl %%eax, %%ebx \n\t"
+            "xorl $0x00200000, %%eax \n\t"
+            "pushl %%eax \n\t"
+            "popfl \n\t"
+            "pushfl \n\t"
+            "popl %%eax \n\t"
+
+            "cmpl %%eax, %%ebx \n\t"
+
+            "je 1f \n\t"
+
+            /* OK, we have CPUID */
+            "movl $1, %%eax \n\t"
+            "cpuid \n\t"
+
+            "test $0x00800000, %%edx \n\t"
+            "jz 1f \n\t"
+
+            "movl $1, %0 \n\t"
+            "jmp 2f \n\t"
+
+            /* 1: MMX (or CPUID itself) not available */
+            "1: \n\t"
+            "movl $0, %0 \n\t"
+
+            /* 2: common exit, restore all registers */
+            "2: \n\t"
+            "popa \n\t"
+            :
+            "=m" (tmp)
+            : /* no input */);
+
+        initialized = TRUE;
+
+        mmx_present = tmp;
+    }
+
+    return mmx_present;
+}
+
+
+#endif /* RENDER */
+#endif /* USE_GCC34_MMX */
diff --git a/fb/fbmmx.h b/fb/fbmmx.h
new file mode 100644
index 000000000..dd8538cc4
--- /dev/null
+++ b/fb/fbmmx.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2004 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Red Hat makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Søren Sandmann (sandmann@redhat.com)
+ *
+ * Based on work by Owen Taylor
+ */
+#ifdef USE_GCC34_MMX
+Bool fbHaveMMX(void); /* TRUE when the CPU reports MMX support */
+#else
+#define fbHaveMMX() FALSE /* function-like so that call sites written as fbHaveMMX() still compile */
+#endif
+
+#ifdef USE_GCC34_MMX
+
+void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, /* fbComposite uses this in place of fbCompositeSolidMask_nx8888x0565C when fbHaveMMX() is true */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSrcAdd_8888x8888mmx (CARD8 op, /* fbComposite uses this in place of fbCompositeSrcAdd_8888x8888 when fbHaveMMX() is true */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSolidMask_nx8888x8888Cmmx (CARD8 op, /* fbComposite uses this in place of fbCompositeSolidMask_nx8888x8888C when fbHaveMMX() is true */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSolidMask_nx8x8888mmx (CARD8 op, /* fbComposite uses this in place of fbCompositeSolidMask_nx8x8888 when fbHaveMMX() is true */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSrcAdd_8000x8000mmx (CARD8 op, /* fbComposite uses this in place of fbCompositeSrcAdd_8000x8000 when fbHaveMMX() is true */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, /* chosen by fbComposite when source and mask are the same drawable at the same origin and the mask has no component alpha; no non-MMX routine is assigned in that branch */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSrc_8888RevNPx0565mmx (CARD8 op, /* same selection as fbCompositeSrc_8888RevNPx8888mmx, for a PICT_r5g6b5 destination */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSolid_nx8888mmx (CARD8 op, /* chosen by fbComposite for an unmasked, repeating 1x1 PICT_a8r8g8b8 source; fbComposite clears srcRepeat when it selects this */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSolid_nx0565mmx (CARD8 op, /* same selection as fbCompositeSolid_nx8888mmx, for a PICT_r5g6b5 destination */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+void fbCompositeSolidMask_nx8x0565mmx (CARD8 op, /* fbComposite uses this in place of fbCompositeSolidMask_nx8x0565 when fbHaveMMX() is true */
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+Bool fbSolidFillmmx (DrawablePtr pDraw, /* fbFill tries this for FillSolid when pPriv->and is 0 and fbHaveMMX() is true; on a nonzero result fbFill returns without calling fbSolid */
+ int x,
+ int y,
+ int width,
+ int height,
+ FbBits xor);
+
+#endif /* USE_GCC34_MMX */
diff --git a/fb/fbpict.c b/fb/fbpict.c
index 3f989bc56..2abdd32d5 100644
--- a/fb/fbpict.c
+++ b/fb/fbpict.c
@@ -30,34 +30,8 @@
#include "picturestr.h"
#include "mipict.h"
#include "fbpict.h"
+#include "fbmmx.h"
-#define cvt8888to0565(s) ((((s) >> 3) & 0x001f) | \
- (((s) >> 5) & 0x07e0) | \
- (((s) >> 8) & 0xf800))
-#define cvt0565to8888(s) (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | \
- ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | \
- ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)))
-
-#if IMAGE_BYTE_ORDER == MSBFirst
-#define Fetch24(a) ((unsigned long) (a) & 1 ? \
- ((*(a) << 16) | *((CARD16 *) ((a)+1))) : \
- ((*((CARD16 *) (a)) << 8) | *((a)+2)))
-#define Store24(a,v) ((unsigned long) (a) & 1 ? \
- ((*(a) = (CARD8) ((v) >> 16)), \
- (*((CARD16 *) ((a)+1)) = (CARD16) (v))) : \
- ((*((CARD16 *) (a)) = (CARD16) ((v) >> 8)), \
- (*((a)+2) = (CARD8) (v))))
-#else
-#define Fetch24(a) ((unsigned long) (a) & 1 ? \
- ((*(a)) | (*((CARD16 *) ((a)+1)) << 8)) : \
- ((*((CARD16 *) (a))) | (*((a)+2) << 16)))
-#define Store24(a,v) ((unsigned long) (a) & 1 ? \
- ((*(a) = (CARD8) (v)), \
- (*((CARD16 *) ((a)+1)) = (CARD16) ((v) >> 8))) : \
- ((*((CARD16 *) (a)) = (CARD16) (v)),\
- (*((a)+2) = (CARD8) ((v) >> 16))))
-#endif
-
CARD32
fbOver (CARD32 x, CARD32 y)
{
@@ -99,43 +73,6 @@ fbIn (CARD32 x, CARD8 y)
return m|n|o|p;
}
-#define fbComposeGetSolid(pict, bits) { \
- FbBits *__bits__; \
- FbStride __stride__; \
- int __bpp__; \
- int __xoff__,__yoff__; \
-\
- fbGetDrawable((pict)->pDrawable,__bits__,__stride__,__bpp__,__xoff__,__yoff__); \
- switch (__bpp__) { \
- case 32: \
- (bits) = *(CARD32 *) __bits__; \
- break; \
- case 24: \
- (bits) = Fetch24 ((CARD8 *) __bits__); \
- break; \
- case 16: \
- (bits) = *(CARD16 *) __bits__; \
- (bits) = cvt0565to8888(bits); \
- break; \
- default: \
- return; \
- } \
- /* manage missing src alpha */ \
- if ((pict)->pFormat->direct.alphaMask == 0) \
- (bits) |= 0xff000000; \
-}
-
-#define fbComposeGetStart(pict,x,y,type,stride,line,mul) {\
- FbBits *__bits__; \
- FbStride __stride__; \
- int __bpp__; \
- int __xoff__,__yoff__; \
-\
- fbGetDrawable((pict)->pDrawable,__bits__,__stride__,__bpp__,__xoff__,__yoff__); \
- (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \
- (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + (mul) * ((x) - __xoff__); \
-}
-
/*
* Naming convention:
*
@@ -168,7 +105,7 @@ fbCompositeSolidMask_nx8x8888 (CARD8 op,
srca = src >> 24;
if (src == 0)
return;
-
+
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
@@ -940,7 +877,12 @@ fbComposite (CARD8 op,
switch (pDst->format) {
case PICT_r5g6b5:
case PICT_b5g6r5:
- func = fbCompositeSolidMask_nx8x0565;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSolidMask_nx8x0565mmx;
+ else
+#endif
+ func = fbCompositeSolidMask_nx8x0565;
break;
case PICT_r8g8b8:
case PICT_b8g8r8:
@@ -950,7 +892,12 @@ fbComposite (CARD8 op,
case PICT_x8r8g8b8:
case PICT_a8b8g8r8:
case PICT_x8b8g8r8:
- func = fbCompositeSolidMask_nx8x8888;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSolidMask_nx8x8888mmx;
+ else
+#endif
+ func = fbCompositeSolidMask_nx8x8888;
break;
}
break;
@@ -959,10 +906,20 @@ fbComposite (CARD8 op,
switch (pDst->format) {
case PICT_a8r8g8b8:
case PICT_x8r8g8b8:
- func = fbCompositeSolidMask_nx8888x8888C;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSolidMask_nx8888x8888Cmmx;
+ else
+#endif
+ func = fbCompositeSolidMask_nx8888x8888C;
break;
case PICT_r5g6b5:
- func = fbCompositeSolidMask_nx8888x0565C;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSolidMask_nx8888x0565Cmmx;
+ else
+#endif
+ func = fbCompositeSolidMask_nx8888x0565C;
break;
}
}
@@ -972,10 +929,20 @@ fbComposite (CARD8 op,
switch (pDst->format) {
case PICT_a8b8g8r8:
case PICT_x8b8g8r8:
- func = fbCompositeSolidMask_nx8888x8888C;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSolidMask_nx8888x8888Cmmx;
+ else
+#endif
+ func = fbCompositeSolidMask_nx8888x8888C;
break;
case PICT_b5g6r5:
- func = fbCompositeSolidMask_nx8888x0565C;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSolidMask_nx8888x0565Cmmx;
+ else
+#endif
+ func = fbCompositeSolidMask_nx8888x0565C;
break;
}
}
@@ -993,55 +960,145 @@ fbComposite (CARD8 op,
func = fbCompositeSolidMask_nx1xn;
break;
}
+ break;
+ }
+ }
+ }
+ else /* has mask and non-repeating source */
+ {
+ if (pSrc->pDrawable == pMask->pDrawable &&
+ xSrc == xMask && ySrc == yMask &&
+ !pMask->componentAlpha)
+ {
+ switch (pSrc->format) {
+ case PICT_x8b8g8r8:
+ switch (pMask->format) {
+ case PICT_a8r8g8b8:
+ case PICT_a8b8g8r8:
+ switch (pDst->format) {
+ case PICT_a8r8g8b8:
+ case PICT_x8r8g8b8:
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrc_8888RevNPx8888mmx;
+#endif
+ break;
+ case PICT_r5g6b5:
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrc_8888RevNPx0565mmx;
+#endif
+ break;
+ }
+ break;
+ }
+ break;
+ case PICT_x8r8g8b8:
+ switch (pMask->format) {
+ case PICT_a8r8g8b8:
+ case PICT_a8b8g8r8:
+ switch (pDst->format) {
+ case PICT_a8b8g8r8:
+ case PICT_x8b8g8r8:
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrc_8888RevNPx8888mmx;
+#endif
+ break;
+ case PICT_r5g6b5:
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrc_8888RevNPx0565mmx;
+#endif
+ break;
+ }
+ break;
+ }
+ break;
}
+ break;
}
}
}
else
{
- switch (pSrc->format) {
- case PICT_a8r8g8b8:
- switch (pDst->format) {
+ if (srcRepeat &&
+ pSrc->pDrawable->width == 1 &&
+ pSrc->pDrawable->height == 1)
+ {
+ /* no mask and repeating source */
+ switch (pSrc->format) {
case PICT_a8r8g8b8:
- case PICT_x8r8g8b8:
- func = fbCompositeSrc_8888x8888;
- break;
- case PICT_r8g8b8:
- func = fbCompositeSrc_8888x0888;
- break;
- case PICT_r5g6b5:
- func = fbCompositeSrc_8888x0565;
+ switch (pDst->format) {
+ case PICT_a8r8g8b8:
+ case PICT_x8r8g8b8:
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ {
+ srcRepeat = FALSE;
+ func = fbCompositeSolid_nx8888mmx;
+ }
+#endif
+ break;
+ case PICT_r5g6b5:
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ {
+ srcRepeat = FALSE;
+ func = fbCompositeSolid_nx0565mmx;
+ }
+#endif
+ break;
+ }
break;
}
- break;
- case PICT_a8b8g8r8:
- switch (pDst->format) {
- case PICT_a8b8g8r8:
- case PICT_x8b8g8r8:
- func = fbCompositeSrc_8888x8888;
- break;
- case PICT_b8g8r8:
- func = fbCompositeSrc_8888x0888;
+ }
+ else
+ {
+ switch (pSrc->format) {
+ case PICT_a8r8g8b8:
+ switch (pDst->format) {
+ case PICT_a8r8g8b8:
+ case PICT_x8r8g8b8:
+ func = fbCompositeSrc_8888x8888;
+ break;
+ case PICT_r8g8b8:
+ func = fbCompositeSrc_8888x0888;
+ break;
+ case PICT_r5g6b5:
+ func = fbCompositeSrc_8888x0565;
+ break;
+ }
break;
- case PICT_b5g6r5:
- func = fbCompositeSrc_8888x0565;
+ case PICT_a8b8g8r8:
+ switch (pDst->format) {
+ case PICT_a8b8g8r8:
+ case PICT_x8b8g8r8:
+ func = fbCompositeSrc_8888x8888;
+ break;
+ case PICT_b8g8r8:
+ func = fbCompositeSrc_8888x0888;
+ break;
+ case PICT_b5g6r5:
+ func = fbCompositeSrc_8888x0565;
+ break;
+ }
break;
- }
- break;
- case PICT_r5g6b5:
- switch (pDst->format) {
case PICT_r5g6b5:
- func = fbCompositeSrc_0565x0565;
+ switch (pDst->format) {
+ case PICT_r5g6b5:
+ func = fbCompositeSrc_0565x0565;
+ break;
+ }
break;
- }
- break;
- case PICT_b5g6r5:
- switch (pDst->format) {
case PICT_b5g6r5:
- func = fbCompositeSrc_0565x0565;
+ switch (pDst->format) {
+ case PICT_b5g6r5:
+ func = fbCompositeSrc_0565x0565;
+ break;
+ }
break;
}
- break;
}
}
break;
@@ -1052,21 +1109,36 @@ fbComposite (CARD8 op,
case PICT_a8r8g8b8:
switch (pDst->format) {
case PICT_a8r8g8b8:
- func = fbCompositeSrcAdd_8888x8888;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrcAdd_8888x8888mmx;
+ else
+#endif
+ func = fbCompositeSrcAdd_8888x8888;
break;
}
break;
case PICT_a8b8g8r8:
switch (pDst->format) {
case PICT_a8b8g8r8:
- func = fbCompositeSrcAdd_8888x8888;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrcAdd_8888x8888mmx;
+ else
+#endif
+ func = fbCompositeSrcAdd_8888x8888;
break;
}
break;
case PICT_a8:
switch (pDst->format) {
case PICT_a8:
- func = fbCompositeSrcAdd_8000x8000;
+#ifdef USE_GCC34_MMX
+ if (fbHaveMMX())
+ func = fbCompositeSrcAdd_8000x8000mmx;
+ else
+#endif
+ func = fbCompositeSrcAdd_8000x8000;
break;
}
break;
diff --git a/fb/fbpict.h b/fb/fbpict.h
index 927434950..cbf213b64 100644
--- a/fb/fbpict.h
+++ b/fb/fbpict.h
@@ -70,6 +70,70 @@ typedef void (*CompositeFunc) (CARD8 op,
CARD16 width,
CARD16 height);
+#define fbComposeGetSolid(pict, bits) { /* read one pixel from the address fbGetDrawable yields for pict's drawable and store it in bits as a 32-bit value */ \
+ FbBits *__bits__; \
+ FbStride __stride__; \
+ int __bpp__; \
+ int __xoff__,__yoff__; \
+\
+ fbGetDrawable((pict)->pDrawable,__bits__,__stride__,__bpp__,__xoff__,__yoff__); \
+ switch (__bpp__) { \
+ case 32: \
+ (bits) = *(CARD32 *) __bits__; \
+ break; \
+ case 24: \
+ (bits) = Fetch24 ((CARD8 *) __bits__); \
+ break; \
+ case 16: \
+ (bits) = *(CARD16 *) __bits__; \
+ (bits) = cvt0565to8888(bits); /* widen the 5-6-5 pixel to 8 bits per channel */ \
+ break; \
+ default: \
+ return; /* any other depth: returns from the function that expanded this macro */ \
+ } \
+ /* manage missing src alpha */ \
+ if ((pict)->pFormat->direct.alphaMask == 0) \
+ (bits) |= 0xff000000; /* format has no alpha mask, so force the top byte to 0xff */ \
+}
+
+#define fbComposeGetStart(pict,x,y,type,stride,line,mul) { /* set stride (counted in elements of type) and line (pointer to the element for x,y) for pict's drawable */ \
+ FbBits *__bits__; \
+ FbStride __stride__; \
+ int __bpp__; \
+ int __xoff__,__yoff__; \
+\
+ fbGetDrawable((pict)->pDrawable,__bits__,__stride__,__bpp__,__xoff__,__yoff__); \
+ (stride) = __stride__ * sizeof (FbBits) / sizeof (type); /* rescale the stride from FbBits units to units of type */ \
+ (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + (mul) * ((x) - __xoff__); /* mul is the number of type elements per pixel step in x. NOTE(review): the offsets are subtracted here, while fbFill adds its dstXoff/dstYoff; confirm the sign convention */ \
+}
+#define cvt8888to0565(s) ((((s) >> 3) & 0x001f) | /* bits 3-7 of s -> 5-bit field at bits 0-4 */ \
+ (((s) >> 5) & 0x07e0) | /* bits 10-15 of s -> 6-bit field at bits 5-10 */ \
+ (((s) >> 8) & 0xf800)) /* bits 19-23 of s -> 5-bit field at bits 11-15; bits 24-31 of s are dropped */
+#define cvt0565to8888(s) (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | /* 5-bit field at bits 0-4 -> byte 0, its top 3 bits repeated in the low bits */ \
+ ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | /* 6-bit field at bits 5-10 -> byte 1, its top 2 bits repeated in the low bits */ \
+ ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))) /* 5-bit field at bits 11-15 -> byte 2, its top 3 bits repeated; bits 24-31 of the result stay 0 */
+
+#if IMAGE_BYTE_ORDER == MSBFirst /* 24-bit pixel access done as one CARD8 plus one CARD16; the address parity test puts the CARD16 on an even address */
+#define Fetch24(a) ((unsigned long) (a) & 1 ? /* odd address: byte first, then the 16-bit half; the byte at a is the most significant */ \
+ ((*(a) << 16) | *((CARD16 *) ((a)+1))) : \
+ ((*((CARD16 *) (a)) << 8) | *((a)+2)))
+#define Store24(a,v) ((unsigned long) (a) & 1 ? /* mirrors Fetch24: same split, writing v instead of reading */ \
+ ((*(a) = (CARD8) ((v) >> 16)), \
+ (*((CARD16 *) ((a)+1)) = (CARD16) (v))) : \
+ ((*((CARD16 *) (a)) = (CARD16) ((v) >> 8)), \
+ (*((a)+2) = (CARD8) (v))))
+#else /* other byte order: the byte at a is the least significant */
+#define Fetch24(a) ((unsigned long) (a) & 1 ? /* odd address: byte first, then the 16-bit half */ \
+ ((*(a)) | (*((CARD16 *) ((a)+1)) << 8)) : \
+ ((*((CARD16 *) (a))) | (*((a)+2) << 16)))
+#define Store24(a,v) ((unsigned long) (a) & 1 ? /* mirrors Fetch24: same split, writing v instead of reading */ \
+ ((*(a) = (CARD8) (v)), \
+ (*((CARD16 *) ((a)+1)) = (CARD16) ((v) >> 8))) : \
+ ((*((CARD16 *) (a)) = (CARD16) (v)),\
+ (*((a)+2) = (CARD8) ((v) >> 16))))
+#endif
+
+
typedef struct _FbCompositeOperand FbCompositeOperand;
typedef CARD32 (*FbCompositeFetch)(FbCompositeOperand *op);
diff --git a/fb/fbsolid.c b/fb/fbsolid.c
index a325da0c2..4b7ff1936 100644
--- a/fb/fbsolid.c
+++ b/fb/fbsolid.c
@@ -49,7 +49,6 @@ fbSolid (FbBits *dst,
return;
}
#endif
-
dst += dstX >> FB_SHIFT;
dstX &= FB_MASK;
FbMaskBitsBytes(dstX, width, and == 0, startmask, startbyte,