/*
 * Accelerated rootless blit
 */
/*
 * This code is largely copied from fbBlt.c.
 *
 * Copyright © 1998 Keith Packard
 * Copyright (c) 2002 Apple Computer, Inc. All Rights Reserved.
 * Copyright (c) 2003 Torrey T. Lyons. All Rights Reserved.
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Keith Packard not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Keith Packard makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
 * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */
/* $XFree86: xc/programs/Xserver/fb/fbblt.c,v 1.8 2000/09/28 00:47:22 keithp Exp $ */

#include "fb.h"
#include "rootlessCommon.h"
#include "rlAccel.h"


/*
 * rlBlt --
 *	Copy a rectangle of bits from one FbBits image to another, merging
 *	with the destination according to a raster op and plane mask.  This
 *	is fbBlt with an extra fast path that hands large, word-aligned
 *	copies to the rootless implementation callbacks of pDstScreen.
 *
 *	srcLine, dstLine     first FbBits word of the top scanline of the
 *			     source / destination
 *	srcStride, dstStride distance between scanlines, in FbBits units
 *	srcX, dstX	     horizontal start offsets, in bits
 *	pDstScreen	     screen whose SCREENREC()->imp callbacks
 *			     (CopyBytes, CompositePixels) may be used
 *	width		     width of the rectangle, in bits
 *	height		     number of scanlines
 *	alu		     raster operation (GXcopy enables the fast path)
 *	pm		     plane mask
 *	bpp		     bits per pixel
 *	reverse		     walk each scanline from right to left
 *	upsidedown	     walk the scanlines from bottom to top
 *
 *	The accelerated path is only attempted when the destination span
 *	has no partial words at either edge, the source offset is byte
 *	aligned, the raster op is GXcopy and the copy is larger than
 *	rootless_CopyBytes_threshold bytes.  If it is not taken (or
 *	CompositePixels does not return Success) the generic software
 *	loops below perform the blit.
 */
void
rlBlt (FbBits   *srcLine,
       FbStride	srcStride,
       int	srcX,

       ScreenPtr pDstScreen,
       FbBits   *dstLine,
       FbStride dstStride,
       int	dstX,

       int	width,
       int	height,

       int	alu,
       FbBits	pm,
       int	bpp,

       Bool	reverse,
       Bool	upsidedown)
{
    FbBits  *src, *dst;
    int	    leftShift, rightShift;
    FbBits  startmask, endmask;
    FbBits  bits, bits1;
    int	    n, nmiddle;
    Bool    destInvarient;
    int	    startbyte, endbyte;
    FbDeclareMergeRop ();

#ifdef FB_24BIT
    /* 24bpp with a plane mask that differs per byte lane needs fbBlt24. */
    if (bpp == 24 && !FbCheck24Pix (pm))
    {
	fbBlt24 (srcLine, srcStride, srcX, dstLine, dstStride, dstX,
		 width, height, alu, pm, reverse, upsidedown);
	return;
    }
#endif
    FbInitializeMergeRop(alu, pm);
    destInvarient = FbDestInvarientMergeRop();
    if (upsidedown)
    {
	/* Start at the last scanline and step backwards through memory. */
	srcLine += (height - 1) * (srcStride);
	dstLine += (height - 1) * (dstStride);
	srcStride = -srcStride;
	dstStride = -dstStride;
    }
    FbMaskBitsBytes (dstX, width, destInvarient, startmask, startbyte,
		     nmiddle, endmask, endbyte);

    /*
     * Beginning of the rootless acceleration code
     *
     * The callbacks address the source at byte granularity (srcX >> 3),
     * so a source offset that is not a multiple of 8 bits cannot be
     * expressed and must go through the shifting software loops instead.
     *
     * NOTE(review): when upsidedown is set the strides are negative at
     * this point, and the byte strides below are computed in an unsigned
     * type.  Confirm that the imp callbacks cope with that on platforms
     * where pointers are wider than unsigned int.
     */
    if (!startmask && !endmask && alu == GXcopy &&
        (srcX & 7) == 0 &&
        height * nmiddle * sizeof(*dst) > rootless_CopyBytes_threshold)
    {
	if (pm == FB_ALLONES && SCREENREC(pDstScreen)->imp->CopyBytes)
	{
	    SCREENREC(pDstScreen)->imp->CopyBytes(
                            nmiddle * sizeof(*dst), height,
                            (char *) srcLine + (srcX >> 3),
                            srcStride * sizeof (*src),
                            (char *) dstLine + (dstX >> 3),
                            dstStride * sizeof (*dst));
	    return;
	}

	/* FIXME: the pm test here isn't super-wonderful - just because
	   we don't care about the top eight bits doesn't necessarily
	   mean we want them set to 255. But doing this does give a
	   factor of two performance improvement when copying from a
	   pixmap to a window, which is pretty common.. */

	else if (bpp == 32 && sizeof(FbBits) == 4 &&
                 pm == 0x00FFFFFFUL && !reverse &&
                 SCREENREC(pDstScreen)->imp->CompositePixels)
	{
	    /* need to copy XRGB to ARGB. */

	    /*
	     * These arrays are deliberately not called "src"/"dest": a
	     * local array named src would hide the FbBits *src above and
	     * turn sizeof(*src) into sizeof(void *), which yields the wrong
	     * row stride whenever pointers are not 4 bytes wide.
	     */
	    void *srcPtrs[2], *dstPtrs[2];
	    unsigned int src_rowbytes[2], dest_rowbytes[2];
            unsigned int fn;

	    srcPtrs[0] = (char *) srcLine + (srcX >> 3);
	    srcPtrs[1] = NULL;
	    src_rowbytes[0] = srcStride * sizeof(FbBits);
	    src_rowbytes[1] = 0;

	    dstPtrs[0] = (char *) dstLine + (dstX >> 3);
	    dstPtrs[1] = dstPtrs[0];
	    dest_rowbytes[0] = dstStride * sizeof(FbBits);
	    dest_rowbytes[1] = dest_rowbytes[0];

	    fn = RL_COMPOSITE_FUNCTION(RL_COMPOSITE_SRC, RL_DEPTH_ARGB8888,
                                       RL_DEPTH_NIL, RL_DEPTH_ARGB8888);

	    /* nmiddle counts FbBits words, i.e. 32bpp pixels in this branch. */
            if (SCREENREC(pDstScreen)->imp->CompositePixels(
                                nmiddle, height,
                                fn, srcPtrs, src_rowbytes,
                                NULL, 0, dstPtrs, dest_rowbytes) == Success)
            {
                return;
            }
	    /* Otherwise fall through to the software blit. */
	}
    }
    /* End of the rootless acceleration code */

    /*
     * Advance the line pointers to the first word to be processed and
     * reduce srcX/dstX to bit offsets within a word.  When running in
     * reverse the pointers are placed one word past the last word
     * touched, because the loops below pre-decrement.
     */
    if (reverse)
    {
	srcLine += ((srcX + width - 1) >> FB_SHIFT) + 1;
	dstLine += ((dstX + width - 1) >> FB_SHIFT) + 1;
	srcX = (srcX + width - 1) & FB_MASK;
	dstX = (dstX + width - 1) & FB_MASK;
    }
    else
    {
	srcLine += srcX >> FB_SHIFT;
	dstLine += dstX >> FB_SHIFT;
	srcX &= FB_MASK;
	dstX &= FB_MASK;
    }
    if (srcX == dstX)
    {
	/* Source and destination share word alignment: no shifting needed. */
	while (height--)
	{
	    src = srcLine;
	    srcLine += srcStride;
	    dst = dstLine;
	    dstLine += dstStride;
	    if (reverse)
	    {
		/* Right to left: right edge, middle words, left edge. */
		if (endmask)
		{
		    bits = *--src;
		    --dst;
		    FbDoRightMaskByteMergeRop(dst, bits, endbyte, endmask);
		}
		n = nmiddle;
		if (destInvarient)
		{
		    while (n--)
			*--dst = FbDoDestInvarientMergeRop(*--src);
		}
		else
		{
		    while (n--)
		    {
			bits = *--src;
			--dst;
			*dst = FbDoMergeRop (bits, *dst);
		    }
		}
		if (startmask)
		{
		    bits = *--src;
		    --dst;
		    FbDoLeftMaskByteMergeRop(dst, bits, startbyte, startmask);
		}
	    }
	    else
	    {
		/* Left to right: left edge, middle words, right edge. */
		if (startmask)
		{
		    bits = *src++;
		    FbDoLeftMaskByteMergeRop(dst, bits, startbyte, startmask);
		    dst++;
		}
		n = nmiddle;
		if (destInvarient)
		{
#if 0
		    /*
		     * This provides some speedup on screen->screen blts
		     * over the PCI bus, usually about 10%.  But fb
		     * isn't usually used for this operation...
		     */
		    if (_ca2 + 1 == 0 && _cx2 == 0)
		    {
			FbBits	t1, t2, t3, t4;
			while (n >= 4)
			{
			    t1 = *src++;
			    t2 = *src++;
			    t3 = *src++;
			    t4 = *src++;
			    *dst++ = t1;
			    *dst++ = t2;
			    *dst++ = t3;
			    *dst++ = t4;
			    n -= 4;
			}
		    }
#endif
		    while (n--)
			*dst++ = FbDoDestInvarientMergeRop(*src++);
		}
		else
		{
		    while (n--)
		    {
			bits = *src++;
			*dst = FbDoMergeRop (bits, *dst);
			dst++;
		    }
		}
		if (endmask)
		{
		    bits = *src;
		    FbDoRightMaskByteMergeRop(dst, bits, endbyte, endmask);
		}
	    }
	}
    }
    else
    {
	/*
	 * Source and destination are at different bit offsets within a
	 * word, so each destination word is assembled from two adjacent
	 * source words.  leftShift + rightShift always equals FB_UNIT.
	 */
	if (srcX > dstX)
	{
	    leftShift = srcX - dstX;
	    rightShift = FB_UNIT - leftShift;
	}
	else
	{
	    rightShift = dstX - srcX;
	    leftShift = FB_UNIT - rightShift;
	}
	while (height--)
	{
	    src = srcLine;
	    srcLine += srcStride;
	    dst = dstLine;
	    dstLine += dstStride;
	    
	    /* bits1 carries the previously fetched source word. */
	    bits1 = 0;
	    if (reverse)
	    {
		/* Prime bits1 when the first output word needs an extra source word. */
		if (srcX < dstX)
		    bits1 = *--src;
		if (endmask)
		{
		    bits = FbScrRight(bits1, rightShift);
		    /* Only fetch another word if the mask covers bits from it. */
		    if (FbScrRight(endmask, leftShift))
		    {
			bits1 = *--src;
			bits |= FbScrLeft(bits1, leftShift);
		    }
		    --dst;
		    FbDoRightMaskByteMergeRop(dst, bits, endbyte, endmask);
		}
		n = nmiddle;
		if (destInvarient)
		{
		    while (n--)
		    {
			bits = FbScrRight(bits1, rightShift);
			bits1 = *--src;
			bits |= FbScrLeft(bits1, leftShift);
			--dst;
			*dst = FbDoDestInvarientMergeRop(bits);
		    }
		}
		else
		{
		    while (n--)
		    {
			bits = FbScrRight(bits1, rightShift);
			bits1 = *--src;
			bits |= FbScrLeft(bits1, leftShift);
			--dst;
			*dst = FbDoMergeRop(bits, *dst);
		    }
		}
		if (startmask)
		{
		    bits = FbScrRight(bits1, rightShift);
		    /* Only fetch another word if the mask covers bits from it. */
		    if (FbScrRight(startmask, leftShift))
		    {
			bits1 = *--src;
			bits |= FbScrLeft(bits1, leftShift);
		    }
		    --dst;
		    FbDoLeftMaskByteMergeRop (dst, bits, startbyte, startmask);
		}
	    }
	    else
	    {
		/* Prime bits1 when the first output word needs an extra source word. */
		if (srcX > dstX)
		    bits1 = *src++;
		if (startmask)
		{
		    bits = FbScrLeft(bits1, leftShift);
		    bits1 = *src++;
		    bits |= FbScrRight(bits1, rightShift);
		    FbDoLeftMaskByteMergeRop (dst, bits, startbyte, startmask);
		    dst++;
		}
		n = nmiddle;
		if (destInvarient)
		{
		    while (n--)
		    {
			bits = FbScrLeft(bits1, leftShift);
			bits1 = *src++;
			bits |= FbScrRight(bits1, rightShift);
			*dst = FbDoDestInvarientMergeRop(bits);
			dst++;
		    }
		}
		else
		{
		    while (n--)
		    {
			bits = FbScrLeft(bits1, leftShift);
			bits1 = *src++;
			bits |= FbScrRight(bits1, rightShift);
			*dst = FbDoMergeRop(bits, *dst);
			dst++;
		    }
		}
		if (endmask)
		{
		    bits = FbScrLeft(bits1, leftShift);
		    /* Only fetch another word if the mask covers bits from it. */
		    if (FbScrLeft(endmask, rightShift))
		    {
			bits1 = *src;
			bits |= FbScrRight(bits1, rightShift);
		    }
		    FbDoRightMaskByteMergeRop (dst, bits, endbyte, endmask);
		}
	    }
	}
    }
}