From 41002623f314444bd416fd5f445a0425c5b59df0 Mon Sep 17 00:00:00 2001 From: Lars Knoll Date: Tue, 12 Jul 2005 10:02:10 +0000 Subject: Add MMX Code paths for the basic composition operations in fbComposeGeneral. --- fb/fbcompose.c | 43 ++-- fb/fbmmx.c | 794 ++++++++++++++++++++++++++++++++++++++++++++++++--------- fb/fbmmx.h | 54 +++- fb/fbpict.c | 8 + fb/fbpict.h | 9 + 5 files changed, 759 insertions(+), 149 deletions(-) (limited to 'fb') diff --git a/fb/fbcompose.c b/fb/fbcompose.c index 9ec696600..37ea420c1 100644 --- a/fb/fbcompose.c +++ b/fb/fbcompose.c @@ -129,10 +129,12 @@ fbFetch_r5g6b5 (const FbBits *bits, int x, int width, CARD32 *buffer, miIndexedP const CARD16 *end = pixel + width; while (pixel < end) { CARD32 p = *pixel++; - *buffer++ = 0xff000000 | - ((((p) << 3) & 0xf8) | (((p) >> 2) & 0x7)) | - ((((p) << 5) & 0xfc00) | (((p) >> 1) & 0x300)) | - ((((p) << 8) & 0xf80000) | (((p) << 3) & 0x70000)); + CARD32 r = (((p) << 3) & 0xf8) | + (((p) << 5) & 0xfc00) | + (((p) << 8) & 0xf80000); + r |= (r >> 5) & 0x70007; + r |= (r >> 6) & 0x300; + *buffer++ = 0xff000000 | r; } } @@ -1428,7 +1430,6 @@ static storeProc storeProcForPicture (PicturePtr pict) /* * Combine src and mask */ - static FASTCALL void fbCombineMaskU (CARD32 *src, const CARD32 *mask, int width) { @@ -1441,13 +1442,9 @@ fbCombineMaskU (CARD32 *src, const CARD32 *mask, int width) } } - - /* * All of the composing functions */ -typedef FASTCALL void (*CombineFuncU) (CARD32 *dest, const CARD32 *src, int width); - static FASTCALL void fbCombineClear (CARD32 *dest, const CARD32 *src, int width) @@ -1607,7 +1604,7 @@ fbCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) da = ~d >> 24; if (sa > da) { - sa = (da << 8) / sa; + sa = FbIntDiv(da, sa); FbByteMul(s, sa); } FbByteAdd(d, s); @@ -1926,7 +1923,7 @@ fbCombineConjointXorU (CARD32 *dest, const CARD32 *src, int width) fbCombineConjointGeneralU (dest, src, width, CombineXor); } -static CombineFuncU combineFuncU[] = { +static CombineFuncU fbCombineFuncU[] = { fbCombineClear, fbCombineSrcU, 0, /* CombineDst */ @@ -1973,8 +1970,6 @@ static CombineFuncU combineFuncU[] = { fbCombineConjointXorU, }; - - static FASTCALL void fbCombineMaskC (CARD32 *src, CARD32 *mask, int width) { @@ -2061,8 +2056,6 @@ fbCombineMaskAlphaC (const CARD32 *src, CARD32 *mask, int width) } } -typedef FASTCALL void (*CombineFuncC) (CARD32 *dest, CARD32 *src, CARD32 *mask, int width); - static FASTCALL void fbCombineClearC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) { @@ -2551,7 +2544,7 @@ fbCombineConjointXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) fbCombineConjointGeneralC (dest, src, mask, width, CombineXor); } -static CombineFuncC combineFuncC[] = { +static CombineFuncC fbCombineFuncC[] = { fbCombineClearC, fbCombineSrcC, 0, /* Dest */ @@ -2598,6 +2591,14 @@ static CombineFuncC combineFuncC[] = { fbCombineConjointXorC, }; + +FbComposeFunctions composeFunctions = { + fbCombineFuncU, + fbCombineFuncC, + fbCombineMaskU +}; + + static void fbFetchSolid(PicturePtr pict, int x, int y, int width, CARD32 *buffer) { FbBits *bits; @@ -3340,7 +3341,7 @@ fbCompositeRect (const FbComposeData *data, CARD32 *scanline_buffer) if (fetchSrc && fetchMask && data->mask && data->mask->componentAlpha && PICT_FORMAT_RGB(data->mask->format)) { CARD32 *mask_buffer = dest_buffer + data->width; - CombineFuncC compose = combineFuncC[data->op]; + CombineFuncC compose = composeFunctions.combineC[data->op]; if (!compose) return; @@ -3362,7 +3363,7 @@ fbCompositeRect (const FbComposeData *data, CARD32 *scanline_buffer) } } else { - CombineFuncU compose = combineFuncU[data->op]; + CombineFuncU compose = composeFunctions.combineU[data->op]; if (!compose) return; @@ -3370,7 +3371,7 @@ fbCompositeRect (const FbComposeData *data, CARD32 *scanline_buffer) fetchSrc(data->src, data->xSrc, data->ySrc, data->width, src_buffer); if (fetchMask) { fetchMask(data->mask, data->xMask, data->yMask, data->width, dest_buffer); - fbCombineMaskU(src_buffer, dest_buffer, data->width); + composeFunctions.combineMaskU(src_buffer, dest_buffer, data->width); } fetchSrc = 0; fetchMask = 0; @@ -3385,7 +3386,7 @@ fbCompositeRect (const FbComposeData *data, CARD32 *scanline_buffer) /* add in mask */ if (fetchMask) { fetchMask(data->mask, data->xMask, data->yMask + i, data->width, dest_buffer); - fbCombineMaskU(src_buffer, dest_buffer, data->width); + composeFunctions.combineMaskU(src_buffer, dest_buffer, data->width); } } @@ -3425,7 +3426,7 @@ fbCompositeGeneral (CARD8 op, CARD32 _scanline_buffer[SCANLINE_BUFFER_LENGTH*3]; CARD32 *scanline_buffer = _scanline_buffer; FbComposeData compose_data; - + if (pSrc->pDrawable) srcRepeat = pSrc->repeat == RepeatNormal && !pSrc->transform && (pSrc->pDrawable->width != 1 || pSrc->pDrawable->height != 1); diff --git a/fb/fbmmx.c b/fb/fbmmx.c index 6517122d4..a89359f61 100644 --- a/fb/fbmmx.c +++ b/fb/fbmmx.c @@ -1,6 +1,7 @@ /* * Copyright © 2004 Red Hat, Inc. * Copyright © 2004 Nicholas Miell + * Copyright © 2005 Trolltech AS * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that @@ -12,15 +13,18 @@ * suitability of this software for any purpose. It is provided "as is" * without express or implied warranty. * - * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. * * Author: Søren Sandmann (sandmann@redhat.com) * Minor Improvements: Nicholas Miell (nmiell@gmail.com) + * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) * * Based on work by Owen Taylor */ @@ -32,12 +36,8 @@ #ifdef USE_MMX - #include - -#ifdef USE_SSE #include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ -#endif #ifdef RENDER @@ -48,8 +48,6 @@ #include "mipict.h" #include "fbpict.h" -typedef unsigned long long ullong; - #define noVERBOSE #ifdef VERBOSE @@ -58,6 +56,582 @@ typedef unsigned long long ullong; #define CHECKPOINT() #endif +/* --------------- MMX code patch for fbcompose.c --------------------- */ + +static FASTCALL void +mmxCombineMaskU (CARD32 *src, const CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = mask + width; + while (mask < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + a = MmxAlpha(a); + MmxMul(s, a); + *src = MmxFrom(s); + ++src; + ++mask; + } + _mm_empty(); +} + + +static FASTCALL void +mmxCombineOverU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, y, a; + x = MmxTo(*src); + y = MmxTo(*dest); + a = MmxAlpha(x); + a = MmxNegate(a); + MmxMulAdd(y, a, x); + *dest = MmxFrom(y); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, y, a; + x = MmxTo(*dest); + y = MmxTo(*src); + a = MmxAlpha(x); + a = MmxNegate(a); + MmxMulAdd(y, a, x); + *dest = MmxFrom(y); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineInU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*src); + a = MmxTo(*dest); + a = MmxAlpha(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*dest); + a = MmxTo(*src); + a = MmxAlpha(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*src); + a = MmxTo(*dest); + a = MmxAlpha(a); + a = MmxNegate(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 x, a; + x = MmxTo(*dest); + a = MmxTo(*src); + a = MmxAlpha(a); + a = MmxNegate(a); + MmxMul(x, a); + *dest = MmxFrom(x); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 s, da, d, sia; + s = MmxTo(*src); + d = MmxTo(*dest); + sia = MmxAlpha(s); + sia = MmxNegate(sia); + da = MmxAlpha(d); + MmxAddMul(s, da, d, sia); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end; + + end = dest + width; + + while (dest < end) { + __m64 s, dia, d, sa; + s = MmxTo(*src); + d = MmxTo(*dest); + sa = MmxAlpha(s); + dia = MmxAlpha(d); + dia = MmxNegate(dia); + MmxAddMul(s, dia, d, sa); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = dest + width; + + while (dest < end) { + __m64 s, dia, d, sia; + s = MmxTo(*src); + d = MmxTo(*dest); + sia = MmxAlpha(s); + dia = MmxAlpha(d); + sia = MmxNegate(sia); + dia = MmxNegate(dia); + MmxAddMul(s, dia, d, sia); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAddU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + + const CARD32 *end = dest + width; + while (dest < end) { + __m64 s, d; + s = MmxTo(*src); + d = MmxTo(*dest); + s = MmxAdd(s, d); + *dest = MmxFrom(s); + ++dest; + ++src; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = dest + width; + while (dest < end) { + CARD32 s = *src; + CARD32 d = *dest; + __m64 ms = MmxTo(s); + __m64 md = MmxTo(d); + CARD32 sa = s >> 24; + CARD32 da = ~d >> 24; + + if (sa > da) { + __m64 msa = MmxTo(FbIntDiv(da, sa)); + msa = MmxAlpha(msa); + MmxMul(ms, msa); + } + MmxAdd(md, ms); + *dest = MmxFrom(md); + ++src; + ++dest; + } + _mm_empty(); +} + + +static FASTCALL void +mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + MmxMul(s, a); + *dest = MmxFrom(s); + ++src; + ++mask; + ++dest; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 sa = MmxAlpha(s); + MmxMul(s, a); + MmxMul(a, sa); + a = MmxNegate(a); + MmxMulAdd(d, a, s); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + da = MmxNegate(da); + MmxMul(s, a); + MmxMulAdd(s, da, d); + *dest = MmxFrom(s); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + + +static FASTCALL void +mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + MmxMul(s, a); + MmxMul(s, da); + *dest = MmxFrom(s); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 sa = MmxAlpha(s); + MmxMul(a, sa); + MmxMul(d, a); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + da = MmxNegate(da); + MmxMul(s, a); + MmxMul(s, da); + *dest = MmxFrom(s); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 sa = MmxAlpha(s); + MmxMul(a, sa); + a = MmxNegate(a); + MmxMul(d, a); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + __m64 sa = MmxAlpha(s); + MmxMul(s, a); + MmxMul(a, sa); + a = MmxNegate(a); + MmxAddMul(d, a, s, da); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + __m64 sa = MmxAlpha(s) + MmxMul(s, a); + MmxMul(a, sa); + da = MmxNegate(da); + MmxAddMul(d, a, s, da); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + __m64 da = MmxAlpha(d); + __m64 sa = MmxAlpha(s); + MmxMul(s, a); + MmxMul(a, sa); + da = MmxNegate(da); + a = MmxNegate(a); + MmxAddMul(d, a, s, da); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +static FASTCALL void +mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width) +{ + const __m64 mmx_0 = _mm_setzero_si64(); + const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL; + + const CARD32 *end = src + width; + while (src < end) { + __m64 a = MmxTo(*mask); + __m64 s = MmxTo(*src); + __m64 d = MmxTo(*dest); + MmxMul(s, a); + d = MmxAdd(s, d); + *dest = MmxFrom(d); + ++src; + ++dest; + ++mask; + } + _mm_empty(); +} + +extern FbComposeFunctions composeFunctions; + +void fbComposeSetupMMX(void) +{ + /* check if we have MMX support and initialize accordingly */ + if (fbHaveMMX()) { + composeFunctions.combineU[PictOpOver] = mmxCombineOverU; + composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU; + composeFunctions.combineU[PictOpIn] = mmxCombineInU; + composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU; + composeFunctions.combineU[PictOpOut] = mmxCombineOutU; + composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU; + composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU; + composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU; + composeFunctions.combineU[PictOpXor] = mmxCombineXorU; + composeFunctions.combineU[PictOpAdd] = mmxCombineAddU; + composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU; + + composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC; + composeFunctions.combineC[PictOpOver] = mmxCombineOverC; + composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC; + composeFunctions.combineC[PictOpIn] = mmxCombineInC; + composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC; + composeFunctions.combineC[PictOpOut] = mmxCombineOutC; + composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC; + composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC; + composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC; + composeFunctions.combineC[PictOpXor] = mmxCombineXorC; + composeFunctions.combineC[PictOpAdd] = mmxCombineAddC; + + composeFunctions.combineMaskU = mmxCombineMaskU; + } +} + + +/* ------------------ MMX code paths called from fbpict.c ----------------------- */ + typedef struct { ullong mmx_4x00ff; @@ -127,12 +701,6 @@ pix_multiply (__m64 a, __m64 b) return res; } -#ifdef USE_SSE -#define HAVE_PSHUFW -#endif - -#ifdef HAVE_PSHUFW - static __inline__ __m64 expand_alpha (__m64 pixel) { @@ -151,61 +719,6 @@ invert_colors (__m64 pixel) return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); } -#else - -static __inline__ __m64 -expand_alpha (__m64 pixel) -{ - __m64 t1, t2; - - t1 = shift (pixel, -48); - t2 = shift (t1, 16); - t1 = _mm_or_si64 (t1, t2); - t2 = shift (t1, 32); - t1 = _mm_or_si64 (t1, t2); - - return t1; -} - -static __inline__ __m64 -expand_alpha_rev (__m64 pixel) -{ - __m64 t1, t2; - - /* move alpha to low 16 bits and zero the rest */ - t1 = shift (pixel, 48); - t1 = shift (t1, -48); - - t2 = shift (t1, 16); - t1 = _mm_or_si64 (t1, t2); - t2 = shift (t1, 32); - t1 = _mm_or_si64 (t1, t2); - - return t1; -} - -static __inline__ __m64 -invert_colors (__m64 pixel) -{ - __m64 x, y, z; - - x = y = z = pixel; - - x = _mm_and_si64 (x, MC(ffff0000ffff0000)); - y = _mm_and_si64 (y, MC(000000000000ffff)); - z = _mm_and_si64 (z, MC(0000ffff00000000)); - - y = shift (y, 32); - z = shift (z, -32); - - x = _mm_or_si64 (x, y); - x = _mm_or_si64 (x, z); - - return x; -} - -#endif - /* Notes about writing mmx code * * give memory operands as the second operand. If you give it as the @@ -1711,7 +2224,7 @@ fbCopyAreammx (DrawablePtr pSrc, d += 2; } - while (w >= 4 && ((unsigned int)d & 7)) + while (w >= 4 && ((unsigned long)d & 7)) { *(CARD32 *)d = *(CARD32 *)s; @@ -1776,7 +2289,89 @@ fbCompositeCopyAreammx (CARD8 op, width, height); } -#ifndef __amd64__ +#if !defined(__amd64__) && !defined(__x86_64__) + +enum CPUFeatures { + NoFeatures = 0, + MMX = 0x1, + MMX_Extensions = 0x2, + SSE = 0x6, + SSE2 = 0x8, + CMOV = 0x10 +}; + +static uint detectCPUFeatures(void) { + uint result; + char vendor[13]; + vendor[0] = 0; + vendor[12] = 0; + /* see p. 118 of amd64 instruction set manual Vol3 */ + asm ("push %%ebx\n" + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ebx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "mov $0x0, %%edx\n" + "xor %%ebx, %%eax\n" + "jz skip\n" + + "mov $0x00000000, %%eax\n" + "cpuid\n" + "mov %%ebx, %1\n" + "mov %%edx, %2\n" + "mov %%ecx, %3\n" + "mov $0x00000001, %%eax\n" + "cpuid\n" + "skip:\n" + "pop %%ebx\n" + "mov %%edx, %0\n" + : "=r" (result), + "=m" (vendor[0]), + "=m" (vendor[4]), + "=m" (vendor[8]) + : + : "%eax", "%ebx", "%ecx", "%edx" + ); + + uint features = 0; + if (result) { + // result now contains the standard feature bits + if (result & (1 << 15)) + features |= CMOV; + if (result & (1 << 23)) + features |= MMX; + if (result & (1 << 25)) + features |= SSE; + if (result & (1 << 26)) + features |= SSE2; + if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) { + /* check for AMD MMX extensions */ + + uint result; + asm("mov $0x80000000, %%eax\n" + "cpuid\n" + "xor %%edx, %%edx\n" + "cmp $0x1, %%eax\n" + "jge skip2\n" + "mov $0x80000001, %%eax\n" + "cpuid\n" + "skip2:\n" + "mov %%edx, %0\n" + : "=r" (result) + : + : "%eax", "%ebx", "%ecx", "%edx" + ); + if (result & (1<<22)) + features |= MMX_Extensions; + } + } + return features; +} + Bool fbHaveMMX (void) { @@ -1785,52 +2380,9 @@ fbHaveMMX (void) if (!initialized) { - int tmp; /* static variables are accessed through %ebx, - * but we mess around with the registers below, - * so we need a temporary variable that can - * be accessed directly. - */ - - __asm__ __volatile__ ( -/* Check if bit 21 in flags word is writeable */ - - "pusha \n\t" - "pushfl \n\t" - "popl %%eax \n\t" - "movl %%eax, %%ebx \n\t" - "xorl $0x00200000, %%eax \n\t" - "pushl %%eax \n\t" - "popfl \n\t" - "pushfl \n\t" - "popl %%eax \n\t" - - "cmpl %%eax, %%ebx \n\t" - - "je .notfound \n\t" - -/* OK, we have CPUID */ - - "movl $1, %%eax \n\t" - "cpuid \n\t" - - "test $0x00800000, %%edx \n\t" - "jz .notfound \n\t" - - "movl $1, %0 \n\t" - "jmp .out \n\t" - - ".notfound: \n\t" - "movl $0, %0 \n\t" - - ".out: \n\t" - "popa \n\t" - : - "=m" (tmp) - : /* no input */); - - initialized = TRUE; - - mmx_present = tmp; + uint features = detectCPUFeatures(); + mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions); + initialized = TRUE; } return mmx_present; diff --git a/fb/fbmmx.h b/fb/fbmmx.h index 9ec8affd7..d40733cfa 100644 --- a/fb/fbmmx.h +++ b/fb/fbmmx.h @@ -1,5 +1,6 @@ /* * Copyright © 2004 Red Hat, Inc. + * Copyright © 2005 Trolltech AS * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that @@ -11,14 +12,17 @@ * suitability of this software for any purpose. It is provided "as is" * without express or implied warranty. * - * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. * * Author: Søren Sandmann (sandmann@redhat.com) + * Lars Knoll (lars@trolltech.com) * * Based on work by Owen Taylor */ @@ -28,7 +32,7 @@ #ifdef USE_MMX -#ifndef __amd64__ +#if !defined(__amd64__) && !defined(__x86_64__) Bool fbHaveMMX(void); #else #define fbHaveMMX() TRUE @@ -40,6 +44,42 @@ Bool fbHaveMMX(void); #ifdef USE_MMX +#define MmxNegate(x) _mm_xor_si64((x), mmx_4x00ff) +#define MmxAlpha(x) _mm_shuffle_pi16 ((x), _MM_SHUFFLE(3, 3, 3, 3)); +#define MmxTo(x) _mm_unpacklo_pi8 (_mm_cvtsi32_si64((x)), mmx_0) +#define MmxFrom(x) (CARD32)_mm_cvtsi64_si32(_mm_packs_pu16((x), mmx_0)) +#define MmxAdd(x, y) _mm_adds_pu16 ((x), (y)) + +#define MmxMulAdd(x, a, y) do { \ + x = _mm_mullo_pi16 (x, a); \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); \ + x = _mm_adds_pu16 (x, mmx_4x0080); \ + x = _mm_srli_pi16 (x, 8); \ + x = _mm_adds_pu16(x, y); \ + } while (0) + +#define MmxMul(x, a) do { \ + x = _mm_mullo_pi16 (x, a); \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); \ + x = _mm_adds_pu16 (x, mmx_4x0080); \ + x = _mm_srli_pi16 (x, 8); \ + } while (0) + +#define MmxAddMul(x, a, y, b) do { \ + x = _mm_mullo_pi16 (x, a); \ + y = _mm_mullo_pi16 (y, b); \ + x = _mm_srli_pi16(x, 1); \ + y = _mm_srli_pi16(y, 1); \ + x = _mm_adds_pu16 (x, y); \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); \ + x = _mm_adds_pu16 (x, mmx_4x0080); \ + x = _mm_srli_pi16 (x, 7); \ + } while (0) + +typedef unsigned long long ullong; + +void fbComposeSetupMMX(void); + void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, PicturePtr pSrc, PicturePtr pMask, diff --git a/fb/fbpict.c b/fb/fbpict.c index 40dd2853c..20d9ad559 100644 --- a/fb/fbpict.c +++ b/fb/fbpict.c @@ -850,6 +850,14 @@ fbComposite (CARD8 op, int x_msk, y_msk, x_src, y_src, x_dst, y_dst; int w, h, w_this, h_this; +#ifdef USE_MMX + static Bool mmx_setup = FALSE; + if (!mmx_setup) { + fbComposeSetupMMX(); + mmx_setup = TRUE; + } +#endif + xDst += pDst->pDrawable->x; yDst += pDst->pDrawable->y; if (pSrc->pDrawable) { diff --git a/fb/fbpict.h b/fb/fbpict.h index 59ab8a9b8..ab44bfa66 100644 --- a/fb/fbpict.h +++ b/fb/fbpict.h @@ -339,6 +339,15 @@ typedef struct _FbComposeData { CARD16 height; } FbComposeData; +typedef FASTCALL void (*CombineMaskU) (CARD32 *src, const CARD32 *mask, int width); +typedef FASTCALL void (*CombineFuncU) (CARD32 *dest, const CARD32 *src, int width); +typedef FASTCALL void (*CombineFuncC) (CARD32 *dest, CARD32 *src, CARD32 *mask, int width); + +typedef struct _FbComposeFunctions { + CombineFuncU *combineU; + CombineFuncC *combineC; + CombineMaskU combineMaskU; +} FbComposeFunctions; /* fbcompose.c */ -- cgit v1.2.3