/* */
/* $XFree86: xc/programs/Xserver/hw/xfree86/vga256/enhanced/vgaBltFillc.c,v 3.0 1996/12/09 11:54:28 dawes Exp $ */

#include "X.h"
#include "misc.h"
#include "compiler.h"

#include "xf86.h"
#include "xf86Priv.h"
#include "xf86_OSlib.h"
#include "xf86_HWlib.h"
#include "vga.h"

#include "cfb.h"

/* fBitBlt.s */
void
fastBitBltCopy(
    int xdir,
    unsigned char *psrc,
    unsigned char *pdst,
    int h,
    int w,
    int srcPitch,
    int dstPitch
)
{
    /*
     * srcPitch and dstPitch are per-row pointer adjustments computed
     * by the caller; together with the -w/+w stepping below they
     * encode the copy direction.  For xdir <= 0 the pointers address
     * the byte just past the right-hand end of each row, and the rows
     * are copied with memmove() since source and destination may
     * overlap.
     */
    if (xdir <= 0) {
        while (h--) {
            memmove(pdst - w, psrc - w, w);
            pdst += dstPitch - w;
            psrc += srcPitch - w;
        }
    } else {
        while (h--) {
            memcpy(pdst, psrc, w);
            pdst += dstPitch + w;
            psrc += srcPitch + w;
        }
    }
}
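/*
 * Illustration only, not part of the original source: the fill
 * routines below note that the caller has already done a PFILL() of
 * fill1/fill2, i.e. replicated the fill byte into every byte of the
 * value (0xef becomes 0xefefefefefefefef).  A minimal sketch of that
 * replication under this file's 64-bit unsigned long assumption; the
 * name vgaReplicateFillByte is hypothetical.
 */
static unsigned long
vgaReplicateFillByte(unsigned char pix)
{
    unsigned long v = pix;

    v |= v << 8;        /* 1 byte  -> 2 bytes */
    v |= v << 16;       /* 2 bytes -> 4 bytes */
    v |= v << 32;       /* 4 bytes -> 8 bytes */
    return v;
}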
/* fFillAnd.s */
unsigned char *
fastFillSolidGXand(
    unsigned char *pdst,
    unsigned long fill1,
    unsigned long fill2,
    int hcount,
    int count,
    int w,
    int widthPitch
)
{
    /*
     * NOTES: the original assembly code presumes hcount > 0 to start
     * with.  The new code assumes that all bytes of fill1 and fill2
     * are consistent, i.e. 0xefefefef and not 0x12345678, because the
     * caller of this routine does a PFILL() of the [fill1, fill2]
     * values before they get here.  For the large block cases
     * (count > 3), the original code assumed that width == count.
     * Fills one row of count bytes per trip through the loop, for
     * hcount rows in all.
     */
    if (count == 0)
        return pdst;

    while (hcount > 0) {
        /* No special 'fast' cases here */
        int cur_count;
        unsigned char tmpb = (unsigned char) fill1;
        unsigned short tmph = (unsigned short) fill1;
        unsigned int tmpi = (unsigned int) fill1;

        cur_count = count;

        /* Handle leading bytes until pdst reaches a large-block boundary */
        if (((long) pdst & 0x1) && cur_count >= 1) {
            /* To next 0mod2 */
            *(unsigned char *) pdst &= tmpb;
            pdst++;
            cur_count--;
        }
        if (((long) pdst & 0x2) && cur_count >= 2) {
            /* To next 0mod4 */
            *(unsigned short *) pdst &= tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (((long) pdst & 0x4) && cur_count >= 4) {
            /* To next 0mod8 */
            *(unsigned int *) pdst &= tmpi;
            pdst += 4;
            cur_count -= 4;
        }

        /*
         * Perform the bulk fill, knowing 0mod8 alignment.
         * Assumes 64-bit longs.
         */
        while (cur_count >= 64) {
            /* Hand unrolled x8, assumes the scheduler does a good job */
            *(unsigned long *) ((long) pdst + 0) &= fill1;
            *(unsigned long *) ((long) pdst + 8) &= fill1;
            *(unsigned long *) ((long) pdst + 16) &= fill1;
            *(unsigned long *) ((long) pdst + 24) &= fill1;
            *(unsigned long *) ((long) pdst + 32) &= fill1;
            *(unsigned long *) ((long) pdst + 40) &= fill1;
            *(unsigned long *) ((long) pdst + 48) &= fill1;
            *(unsigned long *) ((long) pdst + 56) &= fill1;
            pdst += 64;
            cur_count -= 64;
        }

        /* Clean up the trailing bytes */
        while (cur_count >= 8) {
            *(unsigned long *) ((long) pdst + 0) &= fill1;
            pdst += 8;
            cur_count -= 8;
        }
        if (cur_count >= 4) {
            /* On a 0mod4 boundary already */
            *(unsigned int *) pdst &= tmpi;
            pdst += 4;
            cur_count -= 4;
        }
        if (cur_count >= 2) {
            /* On a 0mod2 boundary already */
            *(unsigned short *) pdst &= tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (cur_count >= 1) {
            /* Last possible byte */
            *(unsigned char *) pdst &= tmpb;
            pdst++;
            cur_count--;
        }

        /* Loop epilogue */
        /* assert(cur_count == 0); */
        pdst += widthPitch;
        hcount--;
    }
    return pdst;
}

/* fFillOr.s */
unsigned char *
fastFillSolidGXor(
    unsigned char *pdst,
    unsigned long fill1,
    unsigned long fill2,
    int hcount,
    int count,
    int w,
    int widthPitch
)
{
    /*
     * NOTES: the original assembly code presumes hcount > 0 to start
     * with.  The new code assumes that all bytes of fill1 and fill2
     * are consistent, i.e. 0xefefefef and not 0x12345678, because the
     * caller of this routine does a PFILL() of the [fill1, fill2]
     * values before they get here.  For the large block cases
     * (count > 3), the original code assumed that width == count.
     * Fills one row of count bytes per trip through the loop, for
     * hcount rows in all.
     */
    if (count == 0)
        return pdst;

    while (hcount > 0) {
        /* No special 'fast' cases here */
        int cur_count;
        unsigned char tmpb = (unsigned char) fill1;
        unsigned short tmph = (unsigned short) fill1;
        unsigned int tmpi = (unsigned int) fill1;

        cur_count = count;

        /* Handle leading bytes until pdst reaches a large-block boundary */
        if (((long) pdst & 0x1) && cur_count >= 1) {
            /* To next 0mod2 */
            *(unsigned char *) pdst |= tmpb;
            pdst++;
            cur_count--;
        }
        if (((long) pdst & 0x2) && cur_count >= 2) {
            /* To next 0mod4 */
            *(unsigned short *) pdst |= tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (((long) pdst & 0x4) && cur_count >= 4) {
            /* To next 0mod8 */
            *(unsigned int *) pdst |= tmpi;
            pdst += 4;
            cur_count -= 4;
        }

        /*
         * Perform the bulk fill, knowing 0mod8 alignment.
         * Assumes 64-bit longs.
         */
        while (cur_count >= 64) {
            /* Hand unrolled x8, assumes the scheduler does a good job */
            *(unsigned long *) ((long) pdst + 0) |= fill1;
            *(unsigned long *) ((long) pdst + 8) |= fill1;
            *(unsigned long *) ((long) pdst + 16) |= fill1;
            *(unsigned long *) ((long) pdst + 24) |= fill1;
            *(unsigned long *) ((long) pdst + 32) |= fill1;
            *(unsigned long *) ((long) pdst + 40) |= fill1;
            *(unsigned long *) ((long) pdst + 48) |= fill1;
            *(unsigned long *) ((long) pdst + 56) |= fill1;
            pdst += 64;
            cur_count -= 64;
        }

        /* Clean up the trailing bytes */
        while (cur_count >= 8) {
            *(unsigned long *) ((long) pdst + 0) |= fill1;
            pdst += 8;
            cur_count -= 8;
        }
        if (cur_count >= 4) {
            /* On a 0mod4 boundary already */
            *(unsigned int *) pdst |= tmpi;
            pdst += 4;
            cur_count -= 4;
        }
        if (cur_count >= 2) {
            /* On a 0mod2 boundary already */
            *(unsigned short *) pdst |= tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (cur_count >= 1) {
            /* Last possible byte */
            *(unsigned char *) pdst |= tmpb;
            pdst++;
            cur_count--;
        }

        /* Loop epilogue */
        /* assert(cur_count == 0); */
        pdst += widthPitch;
        hcount--;
    }
    return pdst;
}
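/*
 * Worked example (added commentary, not in the original source) of the
 * alignment dance shared by all of the fill loops in this file: with
 * pdst == base + 3 (base 0mod8) and count == 21, the prologue writes
 * 1 byte (pdst is now 0mod4), skips the 0mod2 step, then writes
 * 4 bytes (pdst is now 0mod8); the 64-byte unrolled loop does not
 * trigger, and the trailing cleanup writes two 8-byte longs:
 * 1 + 4 + 8 + 8 == 21 bytes in all.
 */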
/* fFillXor.s */
unsigned char *
fastFillSolidGXxor(
    unsigned char *pdst,
    unsigned long fill1,
    unsigned long fill2,
    int hcount,
    int count,
    int w,
    int widthPitch
)
{
    /*
     * NOTES: the original assembly code presumes hcount > 0 to start
     * with.  The new code assumes that all bytes of fill1 and fill2
     * are consistent, i.e. 0xefefefef and not 0x12345678, because the
     * caller of this routine does a PFILL() of the [fill1, fill2]
     * values before they get here.  For the large block cases
     * (count > 3), the original code assumed that width == count.
     * Fills one row of count bytes per trip through the loop, for
     * hcount rows in all.
     */
    if (count == 0)
        return pdst;

    while (hcount > 0) {
        /* No special 'fast' cases here */
        int cur_count;
        unsigned char tmpb = (unsigned char) fill1;
        unsigned short tmph = (unsigned short) fill1;
        unsigned int tmpi = (unsigned int) fill1;

        cur_count = count;

        /* Handle leading bytes until pdst reaches a large-block boundary */
        if (((long) pdst & 0x1) && cur_count >= 1) {
            /* To next 0mod2 */
            *(unsigned char *) pdst ^= tmpb;
            pdst++;
            cur_count--;
        }
        if (((long) pdst & 0x2) && cur_count >= 2) {
            /* To next 0mod4 */
            *(unsigned short *) pdst ^= tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (((long) pdst & 0x4) && cur_count >= 4) {
            /* To next 0mod8 */
            *(unsigned int *) pdst ^= tmpi;
            pdst += 4;
            cur_count -= 4;
        }

        /*
         * Perform the bulk fill, knowing 0mod8 alignment.
         * Assumes 64-bit longs.
         */
        while (cur_count >= 64) {
            /* Hand unrolled x8, assumes the scheduler does a good job */
            *(unsigned long *) ((long) pdst + 0) ^= fill1;
            *(unsigned long *) ((long) pdst + 8) ^= fill1;
            *(unsigned long *) ((long) pdst + 16) ^= fill1;
            *(unsigned long *) ((long) pdst + 24) ^= fill1;
            *(unsigned long *) ((long) pdst + 32) ^= fill1;
            *(unsigned long *) ((long) pdst + 40) ^= fill1;
            *(unsigned long *) ((long) pdst + 48) ^= fill1;
            *(unsigned long *) ((long) pdst + 56) ^= fill1;
            pdst += 64;
            cur_count -= 64;
        }

        /* Clean up the trailing bytes */
        while (cur_count >= 8) {
            *(unsigned long *) ((long) pdst + 0) ^= fill1;
            pdst += 8;
            cur_count -= 8;
        }
        if (cur_count >= 4) {
            /* On a 0mod4 boundary already */
            *(unsigned int *) pdst ^= tmpi;
            pdst += 4;
            cur_count -= 4;
        }
        if (cur_count >= 2) {
            /* On a 0mod2 boundary already */
            *(unsigned short *) pdst ^= tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (cur_count >= 1) {
            /* Last possible byte */
            *(unsigned char *) pdst ^= tmpb;
            pdst++;
            cur_count--;
        }

        /* Loop epilogue */
        /* assert(cur_count == 0); */
        pdst += widthPitch;
        hcount--;
    }
    return pdst;
}
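/*
 * Illustration only, not part of the original source: every unrolled
 * loop in this file assumes sizeof(unsigned long) == 8.  A cheap C89
 * compile-time guard for that assumption; the typedef name is
 * hypothetical, and the array size becomes -1 (a compile error) on an
 * ABI with 32-bit longs.
 */
typedef char vgaBltFill_longs_are_64bit[sizeof(unsigned long) == 8 ? 1 : -1];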
/* fFillCopy.s */
unsigned char *
fastFillSolidGXcopy(
    unsigned char *pdst,
    unsigned long fill1,
    unsigned long fill2,
    int hcount,
    int count,
    int w,
    int widthPitch
)
{
    /*
     * NOTES: the original assembly code presumes hcount > 0 to start
     * with.  The new code assumes that all bytes of fill1 and fill2
     * are consistent, i.e. 0xefefefef and not 0x12345678, because the
     * caller of this routine does a PFILL() of the [fill1, fill2]
     * values before they get here.  For the large block cases
     * (count > 3), the original code assumed that width == count.
     * Fills one row of count bytes per trip through the loop, for
     * hcount rows in all.
     */
    if (count == 0)
        return pdst;

    while (hcount > 0) {
        /* No special 'fast' cases here */
        int cur_count;
        unsigned char tmpb = (unsigned char) fill1;
        unsigned short tmph = (unsigned short) fill1;
        unsigned int tmpi = (unsigned int) fill1;

        cur_count = count;

        /* Handle leading bytes until pdst reaches a large-block boundary */
        if (((long) pdst & 0x1) && cur_count >= 1) {
            /* To next 0mod2 */
            *(unsigned char *) pdst = tmpb;
            pdst++;
            cur_count--;
        }
        if (((long) pdst & 0x2) && cur_count >= 2) {
            /* To next 0mod4 */
            *(unsigned short *) pdst = tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (((long) pdst & 0x4) && cur_count >= 4) {
            /* To next 0mod8 */
            *(unsigned int *) pdst = tmpi;
            pdst += 4;
            cur_count -= 4;
        }

        /*
         * Perform the bulk fill, knowing 0mod8 alignment.
         * Assumes 64-bit longs.
         */
        while (cur_count >= 64) {
            /* Hand unrolled x8, assumes the scheduler does a good job */
            *(unsigned long *) ((long) pdst + 0) = fill1;
            *(unsigned long *) ((long) pdst + 8) = fill1;
            *(unsigned long *) ((long) pdst + 16) = fill1;
            *(unsigned long *) ((long) pdst + 24) = fill1;
            *(unsigned long *) ((long) pdst + 32) = fill1;
            *(unsigned long *) ((long) pdst + 40) = fill1;
            *(unsigned long *) ((long) pdst + 48) = fill1;
            *(unsigned long *) ((long) pdst + 56) = fill1;
            pdst += 64;
            cur_count -= 64;
        }

        /* Clean up the trailing bytes */
        while (cur_count >= 8) {
            *(unsigned long *) ((long) pdst + 0) = fill1;
            pdst += 8;
            cur_count -= 8;
        }
        if (cur_count >= 4) {
            /* On a 0mod4 boundary already */
            *(unsigned int *) pdst = tmpi;
            pdst += 4;
            cur_count -= 4;
        }
        if (cur_count >= 2) {
            /* On a 0mod2 boundary already */
            *(unsigned short *) pdst = tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (cur_count >= 1) {
            /* Last possible byte */
            *(unsigned char *) pdst = tmpb;
            pdst++;
            cur_count--;
        }

        /* Loop epilogue */
        /* assert(cur_count == 0); */
        pdst += widthPitch;
        hcount--;
    }
    return pdst;
}
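/*
 * Added commentary, not in the original source: the generalized
 * routine below computes new = (old & fill1) ^ fill2 in each lane,
 * which can express every simpler fill above by choice of constants:
 * fill1 = 0, fill2 = v gives GXcopy; fill1 = ~0, fill2 = v gives
 * GXxor; fill1 = v, fill2 = 0 gives GXand; and fill1 = ~v, fill2 = v
 * gives GXor, since (old & ~v) ^ v == old | v when the two xor
 * operands share no set bits.
 */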
/*
 * Reverse engineered version of the XFree86 code for fFillSet.s by
 * Rick Gorton (gorton@tallis.enet.dec.com).
 * This version should work well on strongly aligned RISC architectures
 * in general.  In particular, the even-odd trip performance problem
 * with 'tribbleloop' is eliminated.
 *
 * Jay, please put the original header back in here...
 */
unsigned char *
fastFillSolidGXset(
    unsigned char *pdst,
    unsigned long fill1,
    unsigned long fill2,
    int hcount,
    int count,
    int w,
    int widthPitch)
{
    /*
     * NOTES: the original assembly code presumes hcount > 0 to start
     * with.  The new code assumes that all bytes of fill1 and fill2
     * are consistent, i.e. 0xefefefef and not 0x12345678, because the
     * caller of this routine does a PFILL() of the [fill1, fill2]
     * values before they get here.  For the large block cases
     * (count > 3), the original code assumed that width == count.
     * Fills one row of count bytes per trip through the loop, for
     * hcount rows in all.
     */
    if (count == 0)
        return pdst;

    while (hcount > 0) {
        /* No special 'fast' cases here */
        int cur_count;
        unsigned char tmpb;     /* unsigned, so the rop math never sign-extends */
        unsigned short tmph;
        unsigned int tmpi;

        cur_count = count;

        /* Handle leading bytes until pdst reaches a large-block boundary */
        if (((long) pdst & 0x1) && cur_count >= 1) {
            /* To next 0mod2 */
            tmpb = *(unsigned char *) pdst;
            tmpb = (tmpb & fill1) ^ fill2;
            *(unsigned char *) pdst = tmpb;
            pdst++;
            cur_count--;
        }
        if (((long) pdst & 0x2) && cur_count >= 2) {
            /* To next 0mod4 */
            tmph = *(unsigned short *) pdst;
            tmph = (tmph & fill1) ^ fill2;
            *(unsigned short *) pdst = tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (((long) pdst & 0x4) && cur_count >= 4) {
            /* To next 0mod8 */
            tmpi = *(unsigned int *) pdst;
            tmpi = (tmpi & fill1) ^ fill2;
            *(unsigned int *) pdst = tmpi;
            pdst += 4;
            cur_count -= 4;
        }

        /*
         * Perform the bulk fill, knowing 0mod8 alignment.
         * Assumes 64-bit longs.
         */
        while (cur_count >= 64) {
            unsigned long tmp_1, tmp_2, tmp_3, tmp_4;
            unsigned long tmp_5, tmp_6, tmp_7, tmp_8;

            /* Hand unrolled x8, assumes the scheduler does a good job */
            tmp_1 = *(unsigned long *) ((long) pdst + 0);
            tmp_2 = *(unsigned long *) ((long) pdst + 8);
            tmp_3 = *(unsigned long *) ((long) pdst + 16);
            tmp_4 = *(unsigned long *) ((long) pdst + 24);
            tmp_5 = *(unsigned long *) ((long) pdst + 32);
            tmp_6 = *(unsigned long *) ((long) pdst + 40);
            tmp_7 = *(unsigned long *) ((long) pdst + 48);
            tmp_8 = *(unsigned long *) ((long) pdst + 56);
            tmp_1 = (fill1 & tmp_1) ^ fill2;
            tmp_2 = (fill1 & tmp_2) ^ fill2;
            tmp_3 = (fill1 & tmp_3) ^ fill2;
            tmp_4 = (fill1 & tmp_4) ^ fill2;
            tmp_5 = (fill1 & tmp_5) ^ fill2;
            tmp_6 = (fill1 & tmp_6) ^ fill2;
            tmp_7 = (fill1 & tmp_7) ^ fill2;
            tmp_8 = (fill1 & tmp_8) ^ fill2;
            *(unsigned long *) ((long) pdst + 0) = tmp_1;
            *(unsigned long *) ((long) pdst + 8) = tmp_2;
            *(unsigned long *) ((long) pdst + 16) = tmp_3;
            *(unsigned long *) ((long) pdst + 24) = tmp_4;
            *(unsigned long *) ((long) pdst + 32) = tmp_5;
            *(unsigned long *) ((long) pdst + 40) = tmp_6;
            *(unsigned long *) ((long) pdst + 48) = tmp_7;
            *(unsigned long *) ((long) pdst + 56) = tmp_8;
            pdst += 64;
            cur_count -= 64;
        }

        /* Clean up the trailing bytes */
        while (cur_count >= 8) {
            unsigned long tmpl;

            tmpl = *(unsigned long *) ((long) pdst + 0);
            tmpl = (tmpl & fill1) ^ fill2;
            *(unsigned long *) ((long) pdst + 0) = tmpl;
            pdst += 8;
            cur_count -= 8;
        }
        if (cur_count >= 4) {
            /* On a 0mod4 boundary already */
            tmpi = *(unsigned int *) pdst;
            tmpi = (tmpi & fill1) ^ fill2;
            *(unsigned int *) pdst = tmpi;
            pdst += 4;
            cur_count -= 4;
        }
        if (cur_count >= 2) {
            /* On a 0mod2 boundary already */
            tmph = *(unsigned short *) pdst;
            tmph = (tmph & fill1) ^ fill2;
            *(unsigned short *) pdst = tmph;
            pdst += 2;
            cur_count -= 2;
        }
        if (cur_count >= 1) {
            /* Last possible byte */
            tmpb = *(unsigned char *) pdst;
            tmpb = (tmpb & fill1) ^ fill2;
            *(unsigned char *) pdst = tmpb;
            pdst++;
            cur_count--;
        }

        /* Loop epilogue */
        /* assert(cur_count == 0); */
        pdst += widthPitch;
        hcount--;
    }
    return pdst;
}
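/*
 * Illustration only, not part of the original source: a minimal sketch
 * of how a caller might drive one of the fills for an h x w byte
 * rectangle at 8bpp.  vgaDemoFillRect, base, and pitch are
 * hypothetical names; note that widthPitch is the gap from the end of
 * one row to the start of the next (bytes per scanline minus the fill
 * width), since each trip advances pdst by count + widthPitch bytes.
 */
static unsigned char *
vgaDemoFillRect(unsigned char *base, int pitch, int x, int y,
                int w, int h, unsigned char pixel)
{
    /* Replicate the pixel into all 8 byte lanes (sketch defined above) */
    unsigned long pat = vgaReplicateFillByte(pixel);

    /* w is passed for both count and w, matching the width == count note */
    return fastFillSolidGXcopy(base + (long) y * pitch + x,
                               pat, pat, h, w, w, pitch - w);
}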