PageRenderTime 346ms CodeModel.GetById 65ms app.highlight 239ms RepoModel.GetById 1ms app.codeStats 2ms

/project/jni/sdl_gfx/SDL_imageFilter.c

https://github.com/aichunyu/FFPlayer
C | 7556 lines | 6038 code | 274 blank | 1244 comment | 520 complexity | d43bab1767b6590f1804da8a41766b90 MD5 | raw file
   1/*
   2
   3SDL_imageFilter - bytes-image "filter" routines.
   4(Uses inline x86 MMX or ASM optimizations if available and enabled.)
   5
   6LGPL (c) A. Schiffler
   7
   8Note: Most of the MMX code is based on published routines 
   9by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
  10him for his work.
  11
  12*/
  13
  14#include <stdio.h>
  15#include <stdlib.h>
  16#include <string.h>
  17
  18#include "SDL_imageFilter.h"
  19
  20/*!
  21\brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.). 
  22*/
  23#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))
  24
  25/* ------ Static variables ----- */
  26
/*! 
\brief Static state which enables the use of the MMX routines. Enabled by default.

Checked by SDL_imageFilterMMXdetect(); cleared by SDL_imageFilterMMXoff()
and set again by SDL_imageFilterMMXon().
*/
static int SDL_imageFilterUseMMX = 1;
  31
/* Detect GCC: GCC__ selects the GCC extended-asm variant of each routine
   in this file; without it the __asm { } block variant is used instead. */
#if defined(__GNUC__)
#define GCC__
#endif
  36
  37/*!
  38\brief Internal function returning the CPU flags. 
  39
  40\returns Flags of system CPU.
  41*/
  42unsigned int _cpuFlags()
  43{
  44	int flags = 0;
  45
  46#ifdef USE_MMX
  47#if !defined(GCC__)
  48	__asm
  49	{
  50		pusha
  51			mov eax, 1
  52			cpuid	/* get CPU ID flag */
  53			mov flags,edx	/* move result to mmx_bit */
  54			popa
  55	}
  56#else
  57	asm volatile ("pusha		     \n\t" "mov    %1, %%eax     \n\t"	/* request feature flag */
  58		"cpuid                \n\t"	/* get CPU ID flag */
  59		"mov    %%edx, %0     \n\t"	/* move result to mmx_bit */
  60		"popa		     \n\t":"=m" (flags)	/* %0 */
  61		:"i"(0x00000001)	/* %1 */
  62		);
  63#endif
  64#endif
  65
  66	return (flags);
  67}
  68
  69/*!
  70\brief MMX detection routine (with override flag). 
  71
  72\returns 1 of MMX was detected, 0 otherwise.
  73*/
  74int SDL_imageFilterMMXdetect(void)
  75{
  76	unsigned int mmx_bit;
  77
  78	/* Check override flag */
  79	if (SDL_imageFilterUseMMX == 0) {
  80		return (0);
  81	}
  82
  83	mmx_bit = _cpuFlags();
  84	mmx_bit &= 0x00800000;
  85	mmx_bit = (mmx_bit && 0x00800000);
  86
  87	return (mmx_bit);
  88}
  89
  90/*!
  91\brief Disable MMX check for filter functions and and force to use non-MMX C based code.
  92*/
  93void SDL_imageFilterMMXoff()
  94{
  95	SDL_imageFilterUseMMX = 0;
  96}
  97
  98/*!
  99\brief Enable MMX check for filter functions and use MMX code if available.
 100*/
 101void SDL_imageFilterMMXon()
 102{
 103	SDL_imageFilterUseMMX = 1;
 104}
 105
 106/* ------------------------------------------------------------------------------------ */
 107
 108/*!
 109\brief Internal MMX Filter using Add: D = saturation255(S1 + S2) 
 110
 111\param Src1 Pointer to the start of the first source byte array (S1).
 112\param Src2 Pointer to the start of the second source byte array (S2).
 113\param Dest Pointer to the start of the destination byte array (D).
 114\param SrcLength The number of bytes in the source arrays.
 115
 116\return Returns 0 for success or -1 for error.
 117*/
 118int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 119{
 120#ifdef USE_MMX
 121#if !defined(GCC__)
 122	__asm
 123	{
 124		pusha
 125			mov eax, Src1	/* load Src1 address into eax */
 126			mov ebx, Src2	/* load Src2 address into ebx */
 127			mov edi, Dest	/* load Dest address into edi */
 128			mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
 129			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
 130			align 16	/* 16 byte alignment of the loop entry */
 131L1010:
 132		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
 133		paddusb mm1, [ebx]	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
 134		movq [edi], mm1	/* store result in Dest */
 135			add eax, 8	/* increase Src1, Src2 and Dest  */
 136			add ebx, 8	/* register pointers by 8 */
 137			add edi, 8
 138			dec ecx	/* decrease loop counter */
 139			jnz L1010	/* check loop termination, proceed if required */
 140			emms /* exit MMX state */
 141			popa
 142	}
 143#else
 144	asm volatile
 145		("pusha		     \n\t" "mov          %2, %%eax \n\t"	/* load Src1 address into eax */
 146		"mov          %1, %%ebx \n\t"	/* load Src2 address into ebx */
 147		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
 148		"mov          %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 149		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 150		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
 151		"1: movq (%%eax), %%mm1 \n\t"    	/* load 8 bytes from Src1 into mm1 */
 152		"paddusb (%%ebx), %%mm1 \n\t"	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
 153		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 154		"add          $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 155		"add          $8, %%ebx \n\t"	/* register pointers by 8 */
 156		"add          $8, %%edi \n\t" "dec              %%ecx \n\t"	/* decrease loop counter */
 157		"jnz             1b     \n\t"     /* check loop termination, proceed if required */
 158		"emms                   \n\t"	/* exit MMX state */
 159		"popa                   \n\t":"=m" (Dest)	/* %0 */
 160		:"m"(Src2),		/* %1 */
 161		"m"(Src1),		/* %2 */
 162		"m"(SrcLength)		/* %3 */
 163		);
 164#endif
 165	return (0);
 166#else
 167	return (-1);
 168#endif
 169}
 170
 171/*!
 172\brief Filter using Add: D = saturation255(S1 + S2) 
 173
 174\param Src1 Pointer to the start of the first source byte array (S1).
 175\param Src2 Pointer to the start of the second source byte array (S2).
 176\param Dest Pointer to the start of the destination byte array (D).
 177\param length The number of bytes in the source arrays.
 178
 179\return Returns 0 for success or -1 for error.
 180*/
 181int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 182{
 183	unsigned int i, istart;
 184	unsigned char *cursrc1, *cursrc2, *curdst;
 185	int result;
 186
 187	/* Validate input parameters */
 188	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 189		return(-1);
 190	if (length == 0)
 191		return(0);
 192
 193	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 194
 195		/* Use MMX assembly routine */
 196		SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
 197
 198		/* Check for unaligned bytes */
 199		if ((length & 7) > 0) {
 200			/* Setup to process unaligned bytes */
 201			istart = length & 0xfffffff8;
 202			cursrc1 = &Src1[istart];
 203			cursrc2 = &Src2[istart];
 204			curdst = &Dest[istart];
 205		} else {
 206			/* No unaligned bytes - we are done */
 207			return (0);
 208		}
 209	} else {
 210		/* Setup to process whole image */
 211		istart = 0;
 212		cursrc1 = Src1;
 213		cursrc2 = Src2;
 214		curdst = Dest;
 215	}
 216
 217	/* C routine to process image */
 218	for (i = istart; i < length; i++) {
 219		result = (int) *cursrc1 + (int) *cursrc2;
 220		if (result > 255)
 221			result = 255;
 222		*curdst = (unsigned char) result;
 223		/* Advance pointers */
 224		cursrc1++;
 225		cursrc2++;
 226		curdst++;
 227	}
 228
 229	return (0);
 230}
 231
 232/*!
 233\brief Internal MMX Filter using Mean: D = S1/2 + S2/2
 234
 235\param Src1 Pointer to the start of the first source byte array (S1).
 236\param Src2 Pointer to the start of the second source byte array (S2).
 237\param Dest Pointer to the start of the destination byte array (D).
 238\param SrcLength The number of bytes in the source arrays.
 239\param Mask Mask array containing 8 bytes with 0x7F value.
 240]
 241\return Returns 0 for success or -1 for error.
 242*/
 243int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
 244						   unsigned char *Mask)
 245{
 246#ifdef USE_MMX
 247#if !defined(GCC__)
 248	__asm
 249	{ 
 250		pusha
 251			mov edx, Mask /* load Mask address into edx */
 252			movq mm0, [edx] /* load Mask into mm0 */
 253		mov eax, Src1 /* load Src1 address into eax */
 254			mov ebx, Src2 /* load Src2 address into ebx */
 255			mov edi, Dest /* load Dest address into edi */
 256			mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
 257			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time) */
 258			align 16	/* 16 byte alignment of the loop entry */
 259L21011:
 260		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 261		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
 262		/* --- Byte shift via Word shift --- */
 263		psrlw mm1, 1 	/* shift 4 WORDS of mm1 1 bit to the right */
 264			psrlw mm2, 1 	/* shift 4 WORDS of mm2 1 bit to the right */
 265			pand mm1, mm0   // apply Mask to 8 BYTES of mm1 */
 266			/* byte     0x0f, 0xdb, 0xc8 */
 267			pand mm2, mm0   // apply Mask to 8 BYTES of mm2 */
 268			/* byte     0x0f, 0xdb, 0xd0 */
 269			paddusb mm1,  mm2 	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
 270			movq [edi],  mm1 	/* store result in Dest */
 271			add eax,  8 	/* increase Src1, Src2 and Dest  */
 272			add ebx,  8 	/* register pointers by 8 */
 273			add edi,  8
 274			dec ecx 	/* decrease loop counter */
 275			jnz L21011	/* check loop termination, proceed if required */
 276			emms	/* exit MMX state */
 277			popa
 278	}
 279#else
 280	asm volatile
 281		("pusha		     \n\t" "movl         %4, %%edx \n\t"	/* load Mask address into edx */
 282		"movq    (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
 283		"mov          %2, %%eax \n\t"	/* load Src1 address into eax */
 284		"mov          %1, %%ebx \n\t"	/* load Src2 address into ebx */
 285		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
 286		"mov          %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 287		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 288		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
 289		"1:                      \n\t"
 290		"movq    (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
 291		"movq    (%%ebx), %%mm2 \n\t"	/* load 8 bytes from Src2 into mm2 */
 292		/* --- Byte shift via Word shift --- */
 293		"psrlw        $1, %%mm1 \n\t"	/* shift 4 WORDS of mm1 1 bit to the right */
 294		"psrlw        $1, %%mm2 \n\t"	/* shift 4 WORDS of mm2 1 bit to the right */
 295		/*      "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of mm1 */
 296		".byte     0x0f, 0xdb, 0xc8 \n\t"
 297		/*      "pand      %%mm0, %%mm2 \n\t"    // apply Mask to 8 BYTES of mm2 */
 298		".byte     0x0f, 0xdb, 0xd0 \n\t" 
 299		"paddusb   %%mm2, %%mm1 \n\t"	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
 300		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 301		"add          $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 302		"add          $8, %%ebx \n\t"	/* register pointers by 8 */
 303		"add          $8, %%edi \n\t" 
 304		"dec              %%ecx \n\t"	/* decrease loop counter */
 305		"jnz                 1b \n\t"     /* check loop termination, proceed if required */
 306		"emms                   \n\t"	/* exit MMX state */
 307		"popa                   \n\t":"=m" (Dest)	/* %0 */
 308		:"m"(Src2),		/* %1 */
 309		"m"(Src1),		/* %2 */
 310		"m"(SrcLength),		/* %3 */
 311		"m"(Mask)			/* %4 */
 312		);
 313#endif
 314	return (0);
 315#else
 316	return (-1);
 317#endif
 318}
 319
 320/*!
 321\brief Filter using Mean: D = S1/2 + S2/2
 322
 323\param Src1 Pointer to the start of the first source byte array (S1).
 324\param Src2 Pointer to the start of the second source byte array (S2).
 325\param Dest Pointer to the start of the destination byte array (D).
 326\param length The number of bytes in the source arrays.
 327
 328\return Returns 0 for success or -1 for error.
 329*/
 330int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 331{
 332	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
 333	unsigned int i, istart;
 334	unsigned char *cursrc1, *cursrc2, *curdst;
 335	int result;
 336
 337	/* Validate input parameters */
 338	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 339		return(-1);
 340	if (length == 0)
 341		return(0);
 342
 343	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 344		/* MMX routine */
 345		SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
 346
 347		/* Check for unaligned bytes */
 348		if ((length & 7) > 0) {
 349			/* Setup to process unaligned bytes */
 350			istart = length & 0xfffffff8;
 351			cursrc1 = &Src1[istart];
 352			cursrc2 = &Src2[istart];
 353			curdst = &Dest[istart];
 354		} else {
 355			/* No unaligned bytes - we are done */
 356			return (0);
 357		}
 358	} else {
 359		/* Setup to process whole image */
 360		istart = 0;
 361		cursrc1 = Src1;
 362		cursrc2 = Src2;
 363		curdst = Dest;
 364	}
 365
 366	/* C routine to process image */
 367	for (i = istart; i < length; i++) {
 368		result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
 369		*curdst = (unsigned char) result;
 370		/* Advance pointers */
 371		cursrc1++;
 372		cursrc2++;
 373		curdst++;
 374	}
 375
 376	return (0);
 377}
 378
 379/*!
 380\brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)
 381
 382\param Src1 Pointer to the start of the first source byte array (S1).
 383\param Src2 Pointer to the start of the second source byte array (S2).
 384\param Dest Pointer to the start of the destination byte array (D).
 385\param SrcLength The number of bytes in the source arrays.
 386
 387\return Returns 0 for success or -1 for error.
 388*/
 389int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 390{
 391#ifdef USE_MMX
 392#if !defined(GCC__)
 393	__asm
 394	{
 395		pusha
 396			mov eax,  Src1 	/* load Src1 address into eax */
 397			mov ebx,  Src2 	/* load Src2 address into ebx */
 398			mov edi,  Dest 	/* load Dest address into edi */
 399			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
 400			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
 401			align 16 /* 16 byte alignment of the loop entry */
 402L1012:
 403		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 404		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 405		movq [edi],  mm1 	/* store result in Dest */
 406			add eax, 8 	/* increase Src1, Src2 and Dest  */
 407			add ebx, 8 	/* register pointers by 8 */
 408			add edi, 8
 409			dec ecx	/* decrease loop counter */
 410			jnz L1012	/* check loop termination, proceed if required */
 411			emms /* exit MMX state */
 412			popa
 413	}
 414#else
 415	asm volatile
 416		("pusha		     \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 417		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 418		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 419		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 420		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 421		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 422		"1: movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
 423		"psubusb (%%ebx), %%mm1 \n\t"	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 424		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 425		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 426		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 427		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 428		"jnz 1b         \n\t"     /* check loop termination, proceed if required */
 429		"emms          \n\t"	/* exit MMX state */
 430		"popa                   \n\t":"=m" (Dest)	/* %0 */
 431		:"m"(Src2),		/* %1 */
 432		"m"(Src1),		/* %2 */
 433		"m"(SrcLength)		/* %3 */
 434		);
 435#endif
 436	return (0);
 437#else
 438	return (-1);
 439#endif
 440}
 441
 442/*!
 443\brief Filter using Sub: D = saturation0(S1 - S2)
 444
 445\param Src1 Pointer to the start of the first source byte array (S1).
 446\param Src2 Pointer to the start of the second source byte array (S2).
 447\param Dest Pointer to the start of the destination byte array (D).
 448\param length The number of bytes in the source arrays.
 449
 450\return Returns 0 for success or -1 for error.
 451*/
 452int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 453{
 454	unsigned int i, istart;
 455	unsigned char *cursrc1, *cursrc2, *curdst;
 456	int result;
 457
 458	/* Validate input parameters */
 459	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 460		return(-1);
 461	if (length == 0)
 462		return(0);
 463
 464	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 465		/* MMX routine */
 466		SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
 467
 468		/* Check for unaligned bytes */
 469		if ((length & 7) > 0) {
 470			/* Setup to process unaligned bytes */
 471			istart = length & 0xfffffff8;
 472			cursrc1 = &Src1[istart];
 473			cursrc2 = &Src2[istart];
 474			curdst = &Dest[istart];
 475		} else {
 476			/* No unaligned bytes - we are done */
 477			return (0);
 478		}
 479	} else {
 480		/* Setup to process whole image */
 481		istart = 0;
 482		cursrc1 = Src1;
 483		cursrc2 = Src2;
 484		curdst = Dest;
 485	}
 486
 487	/* C routine to process image */
 488	for (i = istart; i < length; i++) {
 489		result = (int) *cursrc1 - (int) *cursrc2;
 490		if (result < 0)
 491			result = 0;
 492		*curdst = (unsigned char) result;
 493		/* Advance pointers */
 494		cursrc1++;
 495		cursrc2++;
 496		curdst++;
 497	}
 498
 499	return (0);
 500}
 501
 502/*!
 503\brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |
 504
 505\param Src1 Pointer to the start of the first source byte array (S1).
 506\param Src2 Pointer to the start of the second source byte array (S2).
 507\param Dest Pointer to the start of the destination byte array (D).
 508\param SrcLength The number of bytes in the source arrays.
 509
 510\return Returns 0 for success or -1 for error.
 511*/
 512int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 513{
 514#ifdef USE_MMX
 515#if !defined(GCC__)
 516	__asm
 517	{
 518		pusha
 519			mov eax, Src1  	/* load Src1 address into eax */
 520			mov ebx, Src2 	/* load Src2 address into ebx */
 521			mov edi, Dest 	/* load Dest address into edi */
 522			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
 523			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
 524			align 16	/* 16 byte alignment of the loop entry */
 525L1013:
 526		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 527		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
 528		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 529		psubusb mm2,  [eax] 	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
 530		por mm1,  mm2 	/* combine both mm2 and mm1 results */
 531			movq [edi],  mm1 	/* store result in Dest */
 532			add eax, 8 	/* increase Src1, Src2 and Dest  */
 533			add ebx, 8 	/* register pointers by 8 */
 534			add edi, 8
 535			dec ecx 	/* decrease loop counter */
 536			jnz L1013    	/* check loop termination, proceed if required */
 537			emms         /* exit MMX state */
 538			popa
 539	}
 540#else
 541	asm volatile
 542		("pusha		     \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 543		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 544		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 545		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 546		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 547		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 548		"1: movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
 549		"movq    (%%ebx), %%mm2 \n\t"	/* load 8 bytes from Src2 into mm2 */
 550		"psubusb (%%ebx), %%mm1 \n\t"	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 551		"psubusb (%%eax), %%mm2 \n\t"	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
 552		"por       %%mm2, %%mm1 \n\t"	/* combine both mm2 and mm1 results */
 553		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 554		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 555		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 556		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 557		"jnz 1b        \n\t"      /* check loop termination, proceed if required */
 558		"emms          \n\t"	/* exit MMX state */
 559		"popa                   \n\t":"=m" (Dest)	/* %0 */
 560		:"m"(Src2),		/* %1 */
 561		"m"(Src1),		/* %2 */
 562		"m"(SrcLength)		/* %3 */
 563		);
 564#endif
 565	return (0);
 566#else
 567	return (-1);
 568#endif
 569}
 570
 571/*!
 572\brief Filter using AbsDiff: D = | S1 - S2 |
 573
 574\param Src1 Pointer to the start of the first source byte array (S1).
 575\param Src2 Pointer to the start of the second source byte array (S2).
 576\param Dest Pointer to the start of the destination byte array (D).
 577\param length The number of bytes in the source arrays.
 578
 579\return Returns 0 for success or -1 for error.
 580*/
 581int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 582{
 583	unsigned int i, istart;
 584	unsigned char *cursrc1, *cursrc2, *curdst;
 585	int result;
 586
 587	/* Validate input parameters */
 588	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 589		return(-1);
 590	if (length == 0)
 591		return(0);
 592
 593	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 594		/* MMX routine */
 595		SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
 596
 597		/* Check for unaligned bytes */
 598		if ((length & 7) > 0) {
 599			/* Setup to process unaligned bytes */
 600			istart = length & 0xfffffff8;
 601			cursrc1 = &Src1[istart];
 602			cursrc2 = &Src2[istart];
 603			curdst = &Dest[istart];
 604		} else {
 605			/* No unaligned bytes - we are done */
 606			return (0);
 607		}
 608	} else {
 609		/* Setup to process whole image */
 610		istart = 0;
 611		cursrc1 = Src1;
 612		cursrc2 = Src2;
 613		curdst = Dest;
 614	}
 615
 616	/* C routine to process image */
 617	for (i = istart; i < length; i++) {
 618		result = abs((int) *cursrc1 - (int) *cursrc2);
 619		*curdst = (unsigned char) result;
 620		/* Advance pointers */
 621		cursrc1++;
 622		cursrc2++;
 623		curdst++;
 624	}
 625
 626	return (0);
 627}
 628
 629/*!
 630\brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)
 631
 632\param Src1 Pointer to the start of the first source byte array (S1).
 633\param Src2 Pointer to the start of the second source byte array (S2).
 634\param Dest Pointer to the start of the destination byte array (D).
 635\param SrcLength The number of bytes in the source arrays.
 636
 637\return Returns 0 for success or -1 for error.
 638*/
 639int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 640{
 641#ifdef USE_MMX
 642#if !defined(GCC__)
 643	__asm
 644	{
 645		pusha
 646			mov eax, Src1   /* load Src1 address into eax */
 647			mov ebx, Src2   /* load Src2 address into ebx */
 648			mov edi, Dest   /* load Dest address into edi */
 649			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
 650			shr ecx, 3   /* counter/8 (MMX loads 8 bytes at a time) */
 651			pxor mm0, mm0   /* zero mm0 register */
 652			align 16      	/* 16 byte alignment of the loop entry */
 653L1014:
 654		movq mm1, [eax]   /* load 8 bytes from Src1 into mm1 */
 655		movq mm3, [ebx]   /* load 8 bytes from Src2 into mm3 */
 656		movq mm2, mm1   /* copy mm1 into mm2 */
 657			movq mm4, mm3   /* copy mm3 into mm4  */
 658			punpcklbw mm1, mm0   /* unpack low  bytes of Src1 into words */
 659			punpckhbw mm2, mm0   /* unpack high bytes of Src1 into words */
 660			punpcklbw mm3, mm0   /* unpack low  bytes of Src2 into words */
 661			punpckhbw mm4, mm0   /* unpack high bytes of Src2 into words */
 662			pmullw mm1, mm3   /* mul low  bytes of Src1 and Src2  */
 663			pmullw mm2, mm4   /* mul high bytes of Src1 and Src2 */
 664			/* Take abs value of the results (signed words) */
 665			movq mm5, mm1   /* copy mm1 into mm5 */
 666			movq mm6, mm2   /* copy mm2 into mm6 */
 667			psraw mm5, 15   /* fill mm5 words with word sign bit */
 668			psraw mm6, 15   /* fill mm6 words with word sign bit */
 669			pxor mm1, mm5   /* take 1's compliment of only neg. words */
 670			pxor mm2, mm6   /* take 1's compliment of only neg. words */
 671			psubsw mm1, mm5   /* add 1 to only neg. words, W-(-1) or W-0 */
 672			psubsw mm2, mm6   /* add 1 to only neg. words, W-(-1) or W-0 */
 673			packuswb mm1, mm2   /* pack words back into bytes with saturation */
 674			movq [edi], mm1   /* store result in Dest */
 675			add eax, 8   /* increase Src1, Src2 and Dest  */
 676			add ebx, 8   /* register pointers by 8 */
 677			add edi, 8
 678			dec ecx 	/* decrease loop counter */
 679			jnz L1014	/* check loop termination, proceed if required */
 680			emms /* exit MMX state */
 681			popa
 682	}
 683#else
 684	asm volatile
 685		("pusha		     \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 686		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 687		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 688		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 689		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 690		"pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
 691		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 692		"1: movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
 693		"movq    (%%ebx), %%mm3 \n\t"	/* load 8 bytes from Src2 into mm3 */
 694		"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
 695		"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
 696		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
 697		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
 698		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
 699		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
 700		"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
 701		"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
 702		/* Take abs value of the results (signed words) */
 703		"movq      %%mm1, %%mm5 \n\t"	/* copy mm1 into mm5 */
 704		"movq      %%mm2, %%mm6 \n\t"	/* copy mm2 into mm6 */
 705		"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
 706		"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
 707		"pxor      %%mm5, %%mm1 \n\t"	/* take 1's compliment of only neg. words */
 708		"pxor      %%mm6, %%mm2 \n\t"	/* take 1's compliment of only neg. words */
 709		"psubsw    %%mm5, %%mm1 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
 710		"psubsw    %%mm6, %%mm2 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
 711		"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
 712		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 713		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 714		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 715		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 716		"jnz 1b        \n\t"      /* check loop termination, proceed if required */
 717		"emms          \n\t"	/* exit MMX state */
 718		"popa \n\t":"=m" (Dest)	/* %0 */
 719		:"m"(Src2),		/* %1 */
 720		"m"(Src1),		/* %2 */
 721		"m"(SrcLength)		/* %3 */
 722		);
 723#endif
 724	return (0);
 725#else
 726	return (-1);
 727#endif
 728}
 729
 730/*!
 731\brief Filter using Mult: D = saturation255(S1 * S2)
 732
 733\param Src1 Pointer to the start of the first source byte array (S1).
 734\param Src2 Pointer to the start of the second source byte array (S2).
 735\param Dest Pointer to the start of the destination byte array (D).
 736\param length The number of bytes in the source arrays.
 737
 738\return Returns 0 for success or -1 for error.
 739*/
 740int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 741{
 742	unsigned int i, istart;
 743	unsigned char *cursrc1, *cursrc2, *curdst;
 744	int result;
 745
 746	/* Validate input parameters */
 747	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 748		return(-1);
 749	if (length == 0)
 750		return(0);
 751
 752	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 753		/* MMX routine */
 754		SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
 755
 756		/* Check for unaligned bytes */
 757		if ((length & 7) > 0) {
 758			/* Setup to process unaligned bytes */
 759			istart = length & 0xfffffff8;
 760			cursrc1 = &Src1[istart];
 761			cursrc2 = &Src2[istart];
 762			curdst = &Dest[istart];
 763		} else {
 764			/* No unaligned bytes - we are done */
 765			return (0);
 766		}
 767	} else {
 768		/* Setup to process whole image */
 769		istart = 0;
 770		cursrc1 = Src1;
 771		cursrc2 = Src2;
 772		curdst = Dest;
 773	}
 774
 775	/* C routine to process image */
 776	for (i = istart; i < length; i++) {
 777
 778		/* NOTE: this is probably wrong - dunno what the MMX code does */
 779
 780		result = (int) *cursrc1 * (int) *cursrc2;
 781		if (result > 255)
 782			result = 255;
 783		*curdst = (unsigned char) result;
 784		/* Advance pointers */
 785		cursrc1++;
 786		cursrc2++;
 787		curdst++;
 788	}
 789
 790	return (0);
 791}
 792
 793/*!
 794\brief Internal ASM Filter using MultNor: D = S1 * S2
 795
 796\param Src1 Pointer to the start of the first source byte array (S1).
 797\param Src2 Pointer to the start of the second source byte array (S2).
 798\param Dest Pointer to the start of the destination byte array (D).
 799\param SrcLength The number of bytes in the source arrays.
 800
 801\return Returns 0 for success or -1 for error.
 802*/
 803int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 804{
 805#ifdef USE_MMX
 806#if !defined(GCC__)
 807	__asm
 808	{
 809		pusha
 810			mov edx, Src1   /* load Src1 address into edx */
 811			mov esi, Src2   /* load Src2 address into esi */
 812			mov edi, Dest   /* load Dest address into edi */
 813			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
 814			align 16 	/* 16 byte alignment of the loop entry */
 815L10141:
 816		mov al, [edx]   /* load a byte from Src1 */
 817		mul [esi] 	/* mul with a byte from Src2 */
 818		mov [edi], al   /* move a byte result to Dest */
 819			inc edx 	/* increment Src1, Src2, Dest */
 820			inc esi 		/* pointer registers by one */
 821			inc edi
 822			dec ecx	/* decrease loop counter */
 823			jnz L10141  	/* check loop termination, proceed if required */
 824			popa
 825	}
 826#else
 827	asm volatile
 828		("pusha		     \n\t" "mov %2, %%edx \n\t"	/* load Src1 address into edx */
 829		"mov %1, %%esi \n\t"	/* load Src2 address into esi */
 830		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 831		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 832		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 833		"1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
 834		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
 835		"mov %%al, (%%edi)  \n\t"       /* move a byte result to Dest */
 836		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
 837		"inc %%esi \n\t"		/* pointer registers by one */
 838		"inc %%edi \n\t" "dec %%ecx      \n\t"	/* decrease loop counter */
 839		"jnz 1b         \n\t"     /* check loop termination, proceed if required */
 840		"popa                   \n\t":"=m" (Dest)	/* %0 */
 841		:"m"(Src2),		/* %1 */
 842		"m"(Src1),		/* %2 */
 843		"m"(SrcLength)		/* %3 */
 844		);
 845#endif
 846	return (0);
 847#else
 848	return (-1);
 849#endif
 850}
 851
 852/*!
 853\brief Filter using MultNor: D = S1 * S2
 854
 855\param Src1 Pointer to the start of the first source byte array (S1).
 856\param Src2 Pointer to the start of the second source byte array (S2).
 857\param Dest Pointer to the start of the destination byte array (D).
 858\param length The number of bytes in the source arrays.
 859
 860\return Returns 0 for success or -1 for error.
 861*/
 862int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 863{
 864	unsigned int i, istart;
 865	unsigned char *cursrc1, *cursrc2, *curdst;
 866	int result;
 867
 868	/* Validate input parameters */
 869	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 870		return(-1);
 871	if (length == 0)
 872		return(0);
 873
 874	if (SDL_imageFilterMMXdetect()) {
 875		if (length > 0) {
 876			/* ASM routine */
 877			SDL_imageFilterMultNorASM(Src1, Src2, Dest, length);
 878
 879			/* Check for unaligned bytes */
 880			if ((length & 7) > 0) {
 881				/* Setup to process unaligned bytes */
 882				istart = length & 0xfffffff8;
 883				cursrc1 = &Src1[istart];
 884				cursrc2 = &Src2[istart];
 885				curdst = &Dest[istart];
 886			} else {
 887				/* No unaligned bytes - we are done */
 888				return (0);
 889			}
 890		} else {
 891			/* No bytes - we are done */
 892			return (0);
 893		}
 894	} else {
 895		/* Setup to process whole image */
 896		istart = 0;
 897		cursrc1 = Src1;
 898		cursrc2 = Src2;
 899		curdst = Dest;
 900	}
 901
 902	/* C routine to process image */
 903	for (i = istart; i < length; i++) {
 904		result = (int) *cursrc1 * (int) *cursrc2;
 905		*curdst = (unsigned char) result;
 906		/* Advance pointers */
 907		cursrc1++;
 908		cursrc2++;
 909		curdst++;
 910	}
 911
 912	return (0);
 913}
 914
 915/*!
 916\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
 917
 918\param Src1 Pointer to the start of the first source byte array (S1).
 919\param Src2 Pointer to the start of the second source byte array (S2).
 920\param Dest Pointer to the start of the destination byte array (D).
 921\param SrcLength The number of bytes in the source arrays.
 922
 923\return Returns 0 for success or -1 for error.
 924*/
 925int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 926{
 927#ifdef USE_MMX
 928#if !defined(GCC__)
 929	__asm
 930	{ 
 931		pusha
 932			mov eax, Src1   	/* load Src1 address into eax */
 933			mov ebx, Src2   	/* load Src2 address into ebx */
 934			mov edi, Dest   	/* load Dest address into edi */
 935			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
 936			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
 937			pxor mm0,  mm0 	/* zero mm0 register */
 938			align 16          	/* 16 byte alignment of the loop entry */
 939L1015:
 940		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 941		movq mm3,  [ebx] 	/* load 8 bytes from Src2 into mm3 */
 942		movq mm2,  mm1 	/* copy mm1 into mm2 */
 943			movq mm4,  mm3 	/* copy mm3 into mm4  */
 944			punpcklbw mm1,  mm0 	/* unpack low  bytes of Src1 into words */
 945			punpckhbw mm2,  mm0 	/* unpack high bytes of Src1 into words */
 946			punpcklbw mm3,  mm0 	/* unpack low  bytes of Src2 into words */
 947			punpckhbw mm4,  mm0 	/* unpack high bytes of Src2 into words */
 948			psrlw mm1,  1 	/* divide mm1 words by 2, Src1 low bytes */
 949			psrlw mm2,  1 	/* divide mm2 words by 2, Src1 high bytes */
 950			pmullw mm1,  mm3 	/* mul low  bytes of Src1 and Src2  */
 951			pmullw mm2,  mm4 	/* mul high bytes of Src1 and Src2 */
 952			packuswb mm1,  mm2 	/* pack words back into bytes with saturation */
 953			movq [edi],  mm1 	/* store result in Dest */
 954			add eax,  8 	/* increase Src1, Src2 and Dest  */
 955			add ebx,  8 	/* register pointers by 8 */
 956			add edi,  8
 957			dec ecx        	/* decrease loop counter */
 958			jnz L1015       	/* check loop termination, proceed if required */
 959			emms             	/* exit MMX state */
 960			popa
 961	}
 962#else
 963	asm volatile
 964		("pusha \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 965		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 966		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 967		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 968		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 969		"pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
 970		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 971		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
 972		"movq    (%%ebx), %%mm3 \n\t"	/* load 8 bytes from Src2 into mm3 */
 973		"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
 974		"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
 975		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
 976		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
 977		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
 978		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
 979		"psrlw        $1, %%mm1 \n\t"	/* divide mm1 words by 2, Src1 low bytes */
 980		"psrlw        $1, %%mm2 \n\t"	/* divide mm2 words by 2, Src1 high bytes */
 981		"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
 982		"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
 983		"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
 984		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 985		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 986		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 987		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 988		"jnz 1b        \n\t"	/* check loop termination, proceed if required */
 989		"emms          \n\t"	/* exit MMX state */
 990		"popa \n\t":"=m" (Dest)	/* %0 */
 991		:"m"(Src2),		/* %1 */
 992		"m"(Src1),		/* %2 */
 993		"m"(SrcLength)		/* %3 */
 994		);
 995#endif
 996	return (0);
 997#else
 998	return (-1);
 999#endif
1000}
1001
/*!
\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)

The first source byte is halved (integer division) before it is multiplied
with the second one; products above 255 are clamped to 255.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* index of the first byte left for the C loop */
	int product;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine covers all complete groups of 8 bytes */
		SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);

		/* Only the unaligned remainder (if any) is left for C */
		pos = length - (length & 7);
	}

	/* C routine: whole image without MMX, otherwise just the remainder */
	while (pos < length) {
		product = ((int) Src1[pos] / 2) * (int) Src2[pos];
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
		pos++;
	}

	return (0);
}
1061
/*!
\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Only whole groups of 8 bytes are processed; the remaining (SrcLength & 7)
bytes are left untouched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success (including the case that SrcLength holds no
complete 8 byte group) or -1 for error (NULL pointer, or MMX code not
compiled in).
*/
int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* Reject missing buffers before the assembler dereferences them */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);

	/*
	 * The loop counter is SrcLength/8 and is tested only after the first
	 * pass (dec/jnz). With fewer than 8 bytes it would start at zero, wrap
	 * around and run far past the end of the buffers.
	 */
	if (SrcLength < 8)
		return (0);

#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			pxor mm0, mm0   	/* zero mm0 register */
			align 16          	/* 16 byte alignment of the loop entry */
L1016:
		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]   	/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1   	/* copy mm1 into mm2 */
			movq mm4, mm3   	/* copy mm3 into mm4  */
			punpcklbw mm1, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm2, mm0   	/* unpack high bytes of Src1 into words */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src2 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src2 into words */
			psrlw mm1, 1   	/* divide mm1 words by 2, Src1 low bytes */
			psrlw mm2, 1   	/* divide mm2 words by 2, Src1 high bytes */
			psrlw mm3, 1   	/* divide mm3 words by 2, Src2 low bytes */
			psrlw mm4, 1   	/* divide mm4 words by 2, Src2 high bytes */
			pmullw mm1, mm3   	/* mul low  bytes of Src1 and Src2  */
			pmullw mm2, mm4   	/* mul high bytes of Src1 and Src2 */
			packuswb mm1, mm2   	/* pack words back into bytes with saturation */
			movq [edi], mm1   	/* store result in Dest */
			add eax, 8   	/* increase Src1, Src2 and Dest  */
			add ebx, 8   	/* register pointers by 8 */
			add edi,  8
			dec ecx        	/* decrease loop counter */
			jnz L1016       	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * NOTE(review): pusha lowers %esp by 32 bytes before the "m" operands
	 * are read; if the compiler addresses them relative to %esp the loads
	 * would hit the wrong slots - confirm the build keeps a frame pointer.
	 */
	asm volatile
		("pusha                  \n\t"
		"mov %2, %%eax          \n\t"	/* load Src1 address into eax */
		"mov %1, %%ebx          \n\t"	/* load Src2 address into ebx */
		"mov %0, %%edi          \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx          \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx          \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		"pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
		"movq    (%%ebx), %%mm3 \n\t"	/* load 8 bytes from Src2 into mm3 */
		"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
		"psrlw        $1, %%mm1 \n\t"	/* divide mm1 words by 2, Src1 low bytes */
		"psrlw        $1, %%mm2 \n\t"	/* divide mm2 words by 2, Src1 high bytes */
		"psrlw        $1, %%mm3 \n\t"	/* divide mm3 words by 2, Src2 low bytes */
		"psrlw        $1, %%mm4 \n\t"	/* divide mm4 words by 2, Src2 high bytes */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
		"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax          \n\t"	/* increase Src1, Src2 and Dest  */
		"add $8, %%ebx          \n\t"	/* register pointers by 8 */
		"add $8, %%edi          \n\t"
		"dec %%ecx              \n\t"	/* decrease loop counter */
		"jnz 1b                 \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no outputs: the Dest pointer itself is only read */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"		/* result bytes are stored through Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
1152
/*!
\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Both source bytes are halved (integer division) before they are multiplied;
products above 255 are clamped to 255.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* index of the first byte left for the C loop */
	int product;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine covers all complete groups of 8 bytes */
		SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);

		/* Only the unaligned remainder (if any) is left for C */
		pos = length - (length & 7);
	}

	/* C routine: whole image without MMX, otherwise just the remainder */
	while (pos < length) {
		product = ((int) Src1[pos] / 2) * ((int) Src2[pos] / 2);
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
		pos++;
	}

	return (0);
}
1212
/*!
\brief Internal MMX Filter using BitAnd: D = S1 & S2

Only whole groups of 8 bytes are processed; the remaining (SrcLength & 7)
bytes are left untouched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success (including the case that SrcLength holds no
complete 8 byte group) or -1 for error (NULL pointer, or MMX code not
compiled in).
*/
int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* Reject missing buffers before the assembler dereferences them */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);

	/*
	 * The loop counter is SrcLength/8 and is tested only after the first
	 * pass (dec/jnz). With fewer than 8 bytes it would start at zero, wrap
	 * around and run far past the end of the buffers.
	 */
	if (SrcLength < 8)
		return (0);

#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16          	/* 16 byte alignment of the loop entry */
L1017:
		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
		pand mm1, [ebx]   	/* mm1=Src1&Src2 */
		movq [edi], mm1   	/* store result in Dest */
			add eax, 8   	/* increase Src1, Src2 and Dest  */
			add ebx, 8   	/* register pointers by 8 */
			add edi, 8
			dec ecx        	/* decrease loop counter */
			jnz L1017       	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * NOTE(review): pusha lowers %esp by 32 bytes before the "m" operands
	 * are read; if the compiler addresses them relative to %esp the loads
	 * would hit the wrong slots - confirm the build keeps a frame pointer.
	 */
	asm volatile
		("pusha                  \n\t"
		"mov %2, %%eax          \n\t"	/* load Src1 address into eax */
		"mov %1, %%ebx          \n\t"	/* load Src2 address into ebx */
		"mov %0, %%edi          \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx          \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx          \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
		"pand    (%%ebx), %%mm1 \n\t"	/* mm1=Src1&Src2 */
		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax          \n\t"	/* increase Src1, Src2 and Dest  */
		"add $8, %%ebx          \n\t"	/* register pointers by 8 */
		"add $8, %%edi          \n\t"
		"dec %%ecx              \n\t"	/* decrease loop counter */
		"jnz 1b                 \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no outputs: the Dest pointer itself is only read */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"		/* result bytes are stored through Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
1275
/*!
\brief Filter using BitAnd: D = S1 & S2

Every destination byte is the bitwise AND of the two source bytes at the
same position.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* index of the first byte left for the C loop */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect() > 0) && (length > 7)) {
		/* MMX routine covers all complete groups of 8 bytes */
		SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);

		/* Only the unaligned remainder (if any) is left for C */
		pos = length - (length & 7);
	}

	/* C routine: whole image without MMX, otherwise just the remainder */
	while (pos < length) {
		Dest[pos] = Src1[pos] & Src2[pos];
		pos++;
	}

	return (0);
}
1334
/*!
\brief Internal MMX Filter using BitOr: D = S1 | S2

Only whole groups of 8 bytes are processed; the remaining (SrcLength & 7)
bytes are left untouched.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success (including the case that SrcLength holds no
complete 8 byte group) or -1 for error (NULL pointer, or MMX code not
compiled in).
*/
int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* Reject missing buffers before the assembler dereferences them */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);

	/*
	 * The loop counter is SrcLength/8 and is tested only after the first
	 * pass (dec/jnz). With fewer than 8 bytes it would start at zero, wrap
	 * around and run far past the end of the buffers.
	 */
	if (SrcLength < 8)
		return (0);

#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16          	/* 16 byte alignment of the loop entry */
L91017:
		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
		por mm1, [ebx]   	/* mm1=Src1|Src2 */
		movq [edi], mm1   	/* store result in Dest */
			add eax, 8   	/* increase Src1, Src2 and Dest  */
			add ebx, 8   	/* register pointers by 8 */
			add edi,  8
			dec ecx        	/* decrease loop counter */
			jnz L91017      	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * NOTE(review): pusha lowers %esp by 32 bytes before the "m" operands
	 * are read; if the compiler addresses them relative to %esp the loads
	 * would hit the wrong slots - confirm the build keeps a frame pointer.
	 */
	asm volatile
		("pusha                  \n\t"
		"mov %2, %%eax          \n\t"	/* load Src1 address into eax */
		"mov %1, %%ebx          \n\t"	/* load Src2 address into ebx */
		"mov %0, %%edi          \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx          \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx          \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
		"por     (%%ebx), %%mm1 \n\t"	/* mm1=Src1|Src2 */
		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax          \n\t"	/* increase Src1, Src2 and Dest  */
		"add $8, %%ebx          \n\t"	/* register pointers by 8 */
		"add $8, %%edi          \n\t"
		"dec %%ecx              \n\t"	/* decrease loop counter */
		"jnz 1b                 \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no outputs: the Dest pointer itself is only read */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"		/* result bytes are stored through Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
1397
/*!
\brief Filter using BitOr: D = S1 | S2

Every destination byte is the bitwise OR of the two source bytes at the
same position.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* index of the first byte left for the C loop */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine covers all complete groups of 8 bytes */
		SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);

		/* Only the unaligned remainder (if any) is left for C */
		pos = length - (length & 7);
	}

	/* C routine: whole image without MMX, otherwise just the remainder */
	while (pos < length) {
		Dest[pos] = Src1[pos] | Src2[pos];
		pos++;
	}

	return (0);
}
1453
/*!
\brief Internal ASM Filter using Div: D = S1 / S2

All SrcLength bytes are processed one at a time. A zero divisor in S2
produces 255 in the destination.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success (including SrcLength of 0) or -1 for error
(NULL pointer, or ASM code not compiled in).
*/
int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* Reject missing buffers before the assembler dereferences them */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);

	/*
	 * The loop below tests its counter only after the first pass (dec/jnz).
	 * Entered with a zero counter it would wrap around and run far past
	 * the end of the buffers, so an empty request is answered right here.
	 */
	if (SrcLength == 0)
		return (0);

#if !defined(GCC__)
	__asm
	{
		pusha
			mov edx, Src1   	/* load Src1 address into edx */
			mov esi, Src2   	/* load Src2 address into esi */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			align 16        	/* 16 byte alignment of the loop entry */
L10191:
		mov bl, [esi]   	/* load a byte from Src2 */
		cmp bl, 0   	/* check if it zero */
			jnz L10192
			mov [edi], 255   	/* division by zero = 255 !!! */
			jmp  L10193
L10192:
		xor ah, ah   	/* prepare AX, zero AH register */
			mov al, [edx]   	/* load a byte from Src1 into AL */
		div   bl             	/* divide AL by BL */
			mov [edi], al   	/* move a byte result to Dest */
L10193:
		inc edx    	/* increment Src1, Src2, Dest */
			inc esi    		/* pointer registers by one */
			inc edi
			dec ecx       	/* decrease loop counter */
			jnz L10191     	/* check loop termination, proceed if required */
			popa
	}
#else
	/*
	 * NOTE(review): pusha lowers %esp by 32 bytes before the "m" operands
	 * are read; if the compiler addresses them relative to %esp the loads
	 * would hit the wrong slots - confirm the build keeps a frame pointer.
	 */
	asm volatile
		("pusha                \n\t"
		"mov %2, %%edx        \n\t"	/* load Src1 address into edx */
		"mov %1, %%esi        \n\t"	/* load Src2 address into esi */
		"mov %0, %%edi        \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx        \n\t"	/* load loop counter (SIZE) into ecx */
		".align 16            \n\t"	/* 16 byte alignment of the loop entry */
		"1: mov (%%esi), %%bl \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl   \n\t"	/* check if it zero */
		"jnz 2f               \n\t"
		"movb  $255, (%%edi)  \n\t"	/* division by zero = 255 !!! */
		"jmp 3f               \n\t"
		"2:                   \n\t"
		"xor   %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%edx), %%al  \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl           \n\t"	/* divide AL by BL */
		"mov   %%al, (%%edi)  \n\t"	/* move a byte result to Dest */
		"3: inc %%edx         \n\t"	/* increment Src1, Src2, Dest */
		"inc %%esi            \n\t"	/* pointer registers by one */
		"inc %%edi            \n\t"
		"dec %%ecx            \n\t"	/* decrease loop counter */
		"jnz 1b               \n\t"	/* check loop termination, proceed if required */
		"popa                 \n\t"
		:			/* no outputs: the Dest pointer itself is only read */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"		/* result bytes are stored through Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
1524
/*!
\brief Filter using Div: D = S1 / S2

Integer division of each byte of S1 by the byte of S2 at the same position.
A zero divisor yields 255, matching what the ASM routine stores for that case.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer was passed).
*/
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (SDL_imageFilterMMXdetect()) {
		/* ASM routine works byte by byte: never unaligned bytes left over */
		if (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0)
			return (0);
		/* ASM routine reported failure: let the C routine do the work */
	}

	/* C routine to process the whole image */
	for (i = 0; i < length; i++) {
		if (Src2[i] == 0) {
			/* Dividing by zero is undefined in C; store 255 instead */
			Dest[i] = 255;
		} else {
			Dest[i] = (unsigned char) ((int) Src1[i] / (int) Src2[i]);
		}
	}

	return (0);
}
1577
1578/* ------------------------------------------------------------------------------------ */
1579
/*!
\brief Internal MMX Filter using BitNegation: D = ~S (bitwise complement)

Only whole groups of 8 bytes are processed; the remaining (SrcLength & 7)
bytes are left untouched.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.

\return Returns 0 for success (including the case that SrcLength holds no
complete 8 byte group) or -1 for error (NULL pointer, or MMX code not
compiled in).
*/
int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* Reject missing buffers before the assembler dereferences them */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/*
	 * The loop counter is SrcLength/8 and is tested only after the first
	 * pass (dec/jnz). With fewer than 8 bytes it would start at zero, wrap
	 * around and run far past the end of the buffers.
	 */
	if (SrcLength < 8)
		return (0);

#if !defined(GCC__)
	__asm
	{
		pusha
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16          	/* 16 byte alignment of the loop entry */
L91117:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into mm0 */
		pxor mm0, mm1   	/* negate mm0 by xoring with mm1 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 and Dest  */
			add edi,  8 	/* register pointers by 8 */
			dec ecx        	/* decrease loop counter */
			jnz L91117      	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * NOTE(review): pusha lowers %esp by 32 bytes before the "m" operands
	 * are read; if the compiler addresses them relative to %esp the loads
	 * would hit the wrong slots - confirm the build keeps a frame pointer.
	 */
	asm volatile
		("pusha                  \n\t"
		"pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		"mov %1, %%eax          \n\t"	/* load Src1 address into eax */
		"mov %0, %%edi          \n\t"	/* load Dest address into edi */
		"mov %2, %%ecx          \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx          \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into mm0 */
		"pxor      %%mm1, %%mm0 \n\t"	/* negate mm0 by xoring with mm1 */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax          \n\t"	/* increase Src1 and Dest  */
		"add $8, %%edi          \n\t"	/* register pointers by 8 */
		"dec %%ecx              \n\t"	/* decrease loop counter */
		"jnz 1b                 \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no outputs: the Dest pointer itself is only read */
		:"m"(Dest),		/* %0 */
		"m"(Src1),		/* %1 */
		"m"(SrcLength)		/* %2 */
		:"memory"		/* result bytes are stored through Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
1638
/*!
\brief Filter using BitNegation: D = ~S (bitwise complement)

Every destination byte is the source byte at the same position with all of
its bits inverted.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* index of the first byte left for the C loop */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine covers all complete groups of 8 bytes */
		SDL_imageFilterBitNegationMMX(Src1, Dest, length);

		/* Only the unaligned remainder (if any) is left for C */
		pos = length - (length & 7);
	}

	/* C routine: whole image without MMX, otherwise just the remainder */
	while (pos < length) {
		Dest[pos] = (unsigned char) ~Src1[pos];
		pos++;
	}

	return (0);
}
1690
/*!
\brief Internal MMX Filter using AddByte: D = saturation255(S + C)

Only complete 8-byte groups are processed (SrcLength / 8 of them); any trailing
SrcLength % 8 bytes are left untouched and must be handled by the caller.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 for success (including the case of fewer than 8 bytes, where
nothing is processed), or -1 if a pointer is NULL or the file was compiled
without USE_MMX.
*/
int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/* The assembly loop decrements its group counter before testing it, so a
	   count of zero groups would wrap around and run far past both buffers. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant lists Dest as an output operand and declares
	   no clobbers; confirm this is acceptable for the compilers in use. */
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 8 bytes of MM1 ** */
			mov al, C   	/* load C into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1021:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0,  mm1 	/* MM0=Src1+C (add 8 bytes with saturation) */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1021    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Duplicate C in 8 bytes of MM1 ** */
		"mov           %3, %%al \n\t"	/* load C into AL */
		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with C */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1:                     \n\t" 
		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"paddusb   %%mm1, %%mm0 \n\t"	/* MM0=Src1+C (add 8 bytes with saturation) */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C)			/* %3 */
		);
#endif
	return (0);
#else
	/* Built without MMX support: report that nothing was processed */
	return (-1);
#endif
}
1770
/*!
\brief Filter using AddByte: D = saturation255(S + C) 

When SDL_imageFilterMMXdetect() reports MMX support and at least 8 bytes are
present, the MMX helper is invoked first and the C loop only covers the bytes
past the last multiple of 8. Otherwise the C loop covers the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1, *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: C==0 - the result is an unmodified copy of the source.
	   The copy goes from Src1 into Dest; memmove is used so that calling the
	   filter in place (Src1 == Dest) or on overlapping buffers is well defined. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine */
		SDL_imageFilterAddByteMMX(Src1, Dest, length, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		/* Add in int precision, then clamp to the byte maximum */
		result = (int) *cursrc1 + iC;
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}
	return (0);
}
1836
/*!
\brief Internal MMX Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)

Only complete 8-byte groups are processed (SrcLength / 8 of them); any trailing
SrcLength % 8 bytes are left untouched and must be handled by the caller.
The 8-byte addend is assembled with C in its low 4 bytes and D in its high 4 bytes.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param D Byteorder-swapped constant to add (Cs).

\return Returns 0 for success (including the case of fewer than 8 bytes, where
nothing is processed), or -1 if a pointer is NULL or the file was compiled
without USE_MMX.
*/
int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/* The assembly loop decrements its group counter before testing it, so a
	   count of zero groups would wrap around and run far past both buffers. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant lists Dest as an output operand and declares
	   no clobbers; confirm this is acceptable for the compilers in use. */
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Build the 8-byte addend in MM1: low dword = C, high dword = D ** */
			mov eax, C   	/* load C into EAX */
			movd mm1, eax   	/* copy EAX into MM1 */
			mov eax, D   	/* load D into EAX */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L11023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0,  mm1 	/* MM0=Src1+addend (add 8 bytes with saturation) */
			movq [edi],  mm0 	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L11023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Build the 8-byte addend in MM1: low dword = C, high dword = D ** */
		"mov          %3, %%eax \n\t"	/* load C into EAX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"mov          %4, %%eax \n\t"	/* load D into EAX */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with D */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1:                     \n\t" 
		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"paddusb   %%mm1, %%mm0 \n\t"	/* MM0=Src1+addend (add 8 bytes with saturation) */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C),			/* %3 */
		"m"(D)			/* %4 */
		);
#endif
	return (0);
#else
	/* Built without MMX support: report that nothing was processed */
	return (-1);
#endif
}
1912
/*!
\brief Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)

When SDL_imageFilterMMXdetect() reports MMX support and at least 8 bytes are
present, the MMX helper is invoked first (with C and its byte-swapped form) and
the C loop only covers the bytes past the last multiple of 8. Otherwise the C
loop covers the whole array. The C loop adds byte j of C, counted from the
least-significant byte, to the j-th byte of every 4-byte group.

NOTE(review): the MMX helper is given both C and SWAP_32(C); confirm that the
MMX path and the C loop apply the same per-byte constants.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i, j, istart, D;
	int iC[4];
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: C==0 - the result is an unmodified copy of the source.
	   The copy goes from Src1 into Dest; memmove is used so that calling the
	   filter in place (Src1 == Dest) or on overlapping buffers is well defined. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine */
		D=SWAP_32(C);
		SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process bytes: split C into its four bytes, lowest first */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[0] = (int) ((C >>  0) & 0xff);
	for (i = istart; i < length; i += 4) {
		for (j = 0; j < 4; j++) {
			/* The last group may be shorter than 4 bytes */
			if ((i+j)<length) {
				result = (int) *cursrc1 + iC[j];
				if (result > 255) result = 255;
				*curdest = (unsigned char) result;
				/* Advance pointers */
				cursrc1++;
				curdest++;
			}
		}
	}
	return (0);
}
1985
/*!
\brief Internal MMX Filter using AddByteToHalf: D = saturation255(S/2 + C)

Only complete 8-byte groups are processed (SrcLength / 8 of them); any trailing
SrcLength % 8 bytes are left untouched and must be handled by the caller.
Halving is done by shifting 16-bit words right by one bit and then AND-ing every
byte with Mask, which removes the bit shifted in from the neighbouring byte.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param Mask Pointer to 8 mask bytes of value 0x7F.

\return Returns 0 for success (including the case of fewer than 8 bytes, where
nothing is processed), or -1 if a pointer is NULL or the file was compiled
without USE_MMX.
*/
int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
									unsigned char *Mask)
{
#ifdef USE_MMX
	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL) || (Mask == NULL))
		return (-1);

	/* The assembly loop decrements its group counter before testing it, so a
	   count of zero groups would wrap around and run far past both buffers. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant lists Dest as an output operand and declares
	   no clobbers; confirm this is acceptable for the compilers in use. */
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 8 bytes of MM1 ** */
			mov al, C   	/* load C into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
			mov edx, Mask   	/* load Mask address into edx */
			movq mm0, [edx]   	/* load Mask into mm0 */
		mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1022:
		movq mm2, [eax]   	/* load 8 bytes from Src1 into MM2 */
		psrlw mm2, 1   	/* shift 4 WORDS of MM2 1 bit to the right */
			pand mm2, mm0        /* apply Mask to 8 BYTES of MM2 */
			/* byte     0x0f, 0xdb, 0xd0 */
			paddusb mm2,  mm1 	/* MM2=Src1/2+C (add 8 bytes with saturation) */
			movq [edi], mm2   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1022    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Duplicate C in 8 bytes of MM1 ** */
		"mov           %3, %%al \n\t"	/* load C into AL */
		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with C */
		"movl         %4, %%edx \n\t"	/* load Mask address into edx */
		"movq    (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1:                     \n\t" 
		"movq    (%%eax), %%mm2 \n\t"	/* load 8 bytes from Src1 into MM2 */
		"psrlw        $1, %%mm2 \n\t"	/* shift 4 WORDS of MM2 1 bit to the right */
		/*    "pand      %%mm0, %%mm2 \n\t"    // apply Mask to 8 BYTES of MM2 */
		".byte     0x0f, 0xdb, 0xd0 \n\t" 
		"paddusb   %%mm1, %%mm2 \n\t"	/* MM2=Src1/2+C (add 8 bytes with saturation) */
		"movq    %%mm2, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                  1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C),			/* %3 */
		"m"(Mask)			/* %4 */
		);
#endif
	return (0);
#else
	/* Built without MMX support: report that nothing was processed */
	return (-1);
#endif
}
2078
/*!
\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)

When SDL_imageFilterMMXdetect() reports MMX support and at least 8 bytes are
present, the MMX helper is invoked first and the C loop below only covers the
bytes past the last multiple of 8. Otherwise the C loop covers the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	/* Per-byte mask handed to the MMX helper */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	const int addend = (int) C;
	unsigned int pos = 0;
	int sum;

	/* Reject missing buffers; an empty array needs no work */
	if (Src1 == NULL || Dest == NULL) {
		return (-1);
	}
	if (length == 0) {
		return (0);
	}

	if (SDL_imageFilterMMXdetect() && (length > 7)) {
		/* Hand the array to the MMX routine first */
		SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);

		/* A length that is a multiple of 8 leaves no remainder for the C loop */
		if ((length & 7) == 0) {
			return (0);
		}

		/* Resume at the first byte past the last full 8-byte group */
		pos = length & 0xfffffff8;
	}

	/* C routine: halve each byte, add C and clamp to the byte maximum */
	while (pos < length) {
		sum = (int) (Src1[pos] / 2) + addend;
		Dest[pos] = (unsigned char) ((sum > 255) ? 255 : sum);
		pos++;
	}

	return (0);
}
2140
/*!
\brief Internal MMX Filter using SubByte: D = saturation0(S - C)

Only complete 8-byte groups are processed (SrcLength / 8 of them); any trailing
SrcLength % 8 bytes are left untouched and must be handled by the caller.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success (including the case of fewer than 8 bytes, where
nothing is processed), or -1 if a pointer is NULL or the file was compiled
without USE_MMX.
*/
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/* The assembly loop decrements its group counter before testing it, so a
	   count of zero groups would wrap around and run far past both buffers. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant lists Dest as an output operand and declares
	   no clobbers; confirm this is acceptable for the compilers in use. */
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 8 bytes of MM1 ** */
			mov al, C   	/* load C into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psubusb mm0,  mm1 	/* MM0=Src1-C (sub 8 bytes with saturation) */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Duplicate C in 8 bytes of MM1 ** */
		"mov           %3, %%al \n\t"	/* load C into AL */
		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with C */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psubusb   %%mm1, %%mm0 \n\t"	/* MM0=Src1-C (sub 8 bytes with saturation) */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C)			/* %3 */
		);
#endif
	return (0);
#else
	/* Built without MMX support: report that nothing was processed */
	return (-1);
#endif
}
2219
/*!
\brief Filter using SubByte: D = saturation0(S - C)

When SDL_imageFilterMMXdetect() reports MMX support and at least 8 bytes are
present, the MMX helper is invoked first and the C loop only covers the bytes
past the last multiple of 8. Otherwise the C loop covers the whole array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: C==0 - the result is an unmodified copy of the source.
	   The copy goes from Src1 into Dest; memmove is used so that calling the
	   filter in place (Src1 == Dest) or on overlapping buffers is well defined. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine */
		SDL_imageFilterSubByteMMX(Src1, Dest, length, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		/* Subtract in int precision, then clamp at zero */
		result = (int) *cursrc1 - iC;
		if (result < 0)
			result = 0;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}
	return (0);
}
2285
/*!
\brief Internal MMX Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)

Only complete 8-byte groups are processed (SrcLength / 8 of them); any trailing
SrcLength % 8 bytes are left untouched and must be handled by the caller.
The 8-byte subtrahend is assembled with C in its low 4 bytes and D in its high 4 bytes.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).
\param D Byteorder-swapped constant to subtract (Cs).

\return Returns 0 for success (including the case of fewer than 8 bytes, where
nothing is processed), or -1 if a pointer is NULL or the file was compiled
without USE_MMX.
*/
int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);

	/* The assembly loop decrements its group counter before testing it, so a
	   count of zero groups would wrap around and run far past both buffers. */
	if (SrcLength < 8)
		return (0);

	/* NOTE(review): the GCC variant lists Dest as an output operand and declares
	   no clobbers; confirm this is acceptable for the compilers in use. */
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Build the 8-byte subtrahend in MM1: low dword = C, high dword = D ** */
			mov eax, C   	/* load C into EAX */
			movd mm1, eax   	/* copy EAX into MM1 */
			mov eax, D   	/* load D into EAX */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L11024:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psubusb mm0, mm1 	/* MM0=Src1-subtrahend (sub 8 bytes with saturation) */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L11024    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Build the 8-byte subtrahend in MM1: low dword = C, high dword = D ** */
		"mov          %3, %%eax \n\t"	/* load C into EAX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"mov          %4, %%eax \n\t"	/* load D into EAX */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher bytes of MM1 with D */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psubusb   %%mm1, %%mm0 \n\t"	/* MM0=Src1-subtrahend (sub 8 bytes with saturation) */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                  1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C),			/* %3 */
		"m"(D)			/* %4 */
		);
#endif
	return (0);
#else
	/* Built without MMX support: report that nothing was processed */
	return (-1);
#endif
}
2360
/*!
\brief Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)

When SDL_imageFilterMMXdetect() reports MMX support and at least 8 bytes are
present, the MMX helper is invoked first (with C and its byte-swapped form) and
the C loop only covers the bytes past the last multiple of 8. Otherwise the C
loop covers the whole array. The C loop subtracts byte j of C, counted from the
least-significant byte, from the j-th byte of every 4-byte group.

NOTE(review): the MMX helper is given both C and SWAP_32(C); confirm that the
MMX path and the C loop apply the same per-byte constants.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 if Src1 or Dest is NULL.
*/
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i, j, istart, D;
	int iC[4];
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: C==0 - the result is an unmodified copy of the source.
	   The copy goes from Src1 into Dest; memmove is used so that calling the
	   filter in place (Src1 == Dest) or on overlapping buffers is well defined. */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine */
		D=SWAP_32(C);
		SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image: split C into its four bytes, lowest first */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[0] = (int) ((C >>  0) & 0xff);
	for (i = istart; i < length; i += 4) {
		for (j = 0; j < 4; j++) {
			/* The last group may be shorter than 4 bytes */
			if ((i+j)<length) {
				result = (int) *cursrc1 - iC[j];
				if (result < 0) result = 0;
				*curdest = (unsigned char) result;
				/* Advance pointers */
				cursrc1++;
				curdest++;
			}
		}
	}
	return (0);
}
2433
/*!
\brief Internal MMX Filter using ShiftRight: D = saturation0(S >> N)

Only complete 8-byte groups are processed (SrcLength / 8 of them); any trailing
SrcLength % 8 bytes are left untouched and must be handled by the caller.
The shift is applied to 16-bit words, so a per-byte bit mask is first built by
shifting an all-ones register right one bit at a time, N times, AND-ing with
Mask after each step. That mask then removes the bits that crossed over from
the neighbouring byte.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param Mask Byte array containing 8 bytes with 0x7F value.

\return Returns 0 for success (including the case of fewer than 8 bytes, where
nothing is processed), or -1 if a pointer is NULL or the file was compiled
without USE_MMX.
*/
int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
								 unsigned char *Mask)
{
#ifdef USE_MMX
	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL) || (Mask == NULL))
		return (-1);

	/* The assembly loop decrements its group counter before testing it, so a
	   count of zero groups would wrap around and run far past both buffers. */
	if (SrcLength < 8)
		return (0);

	/* The mask-building loop also decrements before testing, so N == 0 would
	   wrap its 8-bit counter and end with an all-zero mask. A shift by zero is
	   a plain copy of the full 8-byte groups (memmove allows in-place use). */
	if (N == 0) {
		memmove(Dest, Src1, SrcLength & 0xfffffff8);
		return (0);
	}

	/* NOTE(review): the GCC variant lists Dest as an output operand and declares
	   no clobbers; confirm this is acceptable for the compilers in use. */
#if !defined(GCC__)
	__asm
	{
		pusha
			mov edx, Mask   	/* load Mask address into edx */
			movq mm0, [edx]   	/* load Mask into mm0 */
		xor ecx, ecx   	/* zero ECX */
			mov cl,  N 	/* load loop counter (N) into CL */
			movd mm3,  ecx 	/* copy (N) into MM3  */
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
L10240:                  	/* ** Prepare proper bit-Mask in MM1 ** */
		psrlw mm1,  1 	/* shift 4 WORDS of MM1 1 bit to the right */
			pand mm1, mm0   /* apply Mask to 8 BYTES of MM1 */
			/*  byte     0x0f, 0xdb, 0xc8 */
			dec               cl    	/* decrease loop counter */
			jnz            L10240    	/* check loop termination, proceed if required */
			/* ** Shift all bytes of the image ** */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L10241:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psrlw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the right */
			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 */
			/* byte     0x0f, 0xdb, 0xc1 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10241    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t" "movl         %4, %%edx \n\t"	/* load Mask address into edx */
		"movq    (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
		"xor       %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov           %3, %%cl \n\t"	/* load loop counter (N) into CL */
		"movd      %%ecx, %%mm3 \n\t"	/* copy (N) into MM3  */
		"pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		"1:                     \n\t"	/* ** Prepare proper bit-Mask in MM1 ** */
		"psrlw        $1, %%mm1 \n\t"	/* shift 4 WORDS of MM1 1 bit to the right */
		/*    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1 */
		".byte     0x0f, 0xdb, 0xc8 \n\t" 
		"dec               %%cl \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		/* ** Shift all bytes of the image ** */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"2:                     \n\t" 
		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psrlw     %%mm3, %%mm0 \n\t"	/* shift 4 WORDS of MM0 (N) bits to the right */
		/*    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0 */
		".byte     0x0f, 0xdb, 0xc1 \n\t" 
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 2b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N),			/* %3 */
		"m"(Mask)			/* %4 */
		);
#endif
	return (0);
#else
	/* Built without MMX support: report that nothing was processed */
	return (-1);
#endif
}
2527
/*!
\brief Filter using ShiftRight: D = saturation0(S >> N)

Every source byte is shifted right by N bits and stored in the matching
destination byte. When MMX is detected and at least 8 bytes are supplied,
the whole 8-byte groups are handled by SDL_imageFilterShiftRightMMX() and
only the remaining (length & 7) bytes go through the C loop; otherwise the
C loop processes the complete array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	/* Per-byte mask handed to the MMX routine (clears the top bit of every byte) */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i, istart;
	unsigned char *cursrc1;
	unsigned char *curdest;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Check shift */
	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy S -> D */
	if (N == 0) {
		/* Destination comes first; memmove keeps in-place calls (Src1 == Dest) well defined */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		/* MMX routine */
		SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		/* The byte is promoted to int before shifting, so N == 8 yields 0 */
		*curdest = (unsigned char) (*cursrc1 >> N);
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
2594
/*!
\brief Internal MMX Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)

Processes SrcLength/8 groups of 8 bytes; each group is shifted as two
32-bit values (psrld). Bytes beyond the last full group (SrcLength & 7) are
not touched. The loop counter is decremented before it is tested, so
SrcLength must be at least 8.

The shift count is first loaded, zero-extended, into MM1: the memory form of
psrld reads a 64-bit count, so it must not be pointed at the single byte N.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 for success or -1 for error (built without USE_MMX).
*/
int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			xor ecx, ecx   	/* zero ECX */
			mov cl, N   	/* load shift count (N) into CL */
			movd mm1, ecx   	/* copy zero-extended (N) into MM1 */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L13023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psrld mm0, mm1   	/* shift 2 DWORDS of MM0 (N) bits to the right */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L13023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		"xor       %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov           %3, %%cl \n\t"	/* load shift count (N) into CL */
		"movd      %%ecx, %%mm1 \n\t"	/* copy zero-extended (N) into MM1 */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psrld     %%mm1, %%mm0 \n\t"	/* shift 2 DWORDS of MM0 (N) bits to the right */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N)			/* %3 */
		:"memory"		/* the loop reads *Src1 and writes *Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
2655
/*!
\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)

The byte arrays are processed as a sequence of native-endian unsigned int
values (4 bytes each on the targets this file supports), each shifted right
by N bits. When MMX is detected and at least 8 bytes are supplied, the whole
8-byte groups go through SDL_imageFilterShiftRightUintMMX() and the C loop
only handles what is left. Trailing bytes that do not form a complete
unsigned int are left untouched in Dest.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error (NULL pointer or N > 32).
*/
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	const unsigned int step = (unsigned int) sizeof(unsigned int);
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	unsigned int word;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 32) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy S -> D */
	if (N == 0) {
		/* Destination comes first; memmove keeps in-place calls (Src1 == Dest) well defined */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image: one complete unsigned int per iteration.
	   "length - i" cannot wrap because i never exceeds length. */
	for (i = istart; (length - i) >= step; i += step) {
		/* memcpy avoids unaligned / type-punned access through the byte pointers */
		memcpy(&word, cursrc1, sizeof(word));
		/* Shifting a 32-bit value by 32 is undefined in C; the result is defined here as 0 */
		word = (N < 32) ? (word >> N) : 0u;
		memcpy(curdest, &word, sizeof(word));
		/* Advance pointers */
		cursrc1 += step;
		curdest += step;
	}

	return (0);
}
2725
/*!
\brief Internal MMX Filter using MultByByte: D = saturation255(S * C)

Processes SrcLength/8 groups of 8 bytes. The bytes are unpacked to words,
multiplied by C and packed back with packuswb. packuswb interprets the words
as signed, so two loops exist:
- C <= 128: every product fits a signed word and can be packed directly.
- C > 128: products may have the word sign bit set; the absolute value of
  each word is taken first so that those products saturate to 255.

The loop is selected by comparing C itself (unsigned). The comparison is done
on the memory operand because AL no longer holds C once EAX has been loaded
with the Src1 address.

Bytes beyond the last full group (SrcLength & 7) are not touched. The loop
counter is decremented before it is tested, so SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (built without USE_MMX).
*/
int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 4 words of MM1 ** */
			mov al, C   	/* load C into AL */
			xor ah, ah   	/* zero AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
			pxor mm0, mm0   	/* zero MM0 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			cmp C, 128   	/* if (C <= 128) execute more efficient code */
			ja             L10251    	/* unsigned compare: C > 128 takes the overflow-safe loop */
			align 16                 	/* 16 byte alignment of the loop entry */
L10250:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
			pmullw mm3, mm1   	/* mul low  bytes of SrcDest and MM1 */
			pmullw mm4, mm1   	/* mul high bytes of SrcDest and MM1 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10250    	/* check loop termination, proceed if required */
			jmp            L10252
			align 16                 	/* 16 byte alignment of the loop entry */
L10251:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
			pmullw mm3, mm1   	/* mul low  bytes of SrcDest and MM1 */
			pmullw mm4, mm1   	/* mul high bytes of SrcDest and MM1 */
			/* ** Take abs value of the results (signed words) ** */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10251    	/* check loop termination, proceed if required */
L10252:
		emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Duplicate C in 4 words of MM1 ** */
		"mov           %3, %%al \n\t"	/* load C into AL */
		"xor         %%ah, %%ah \n\t"	/* zero AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher words of MM1 with C */
		"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 register */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		"cmpb         $128, %3 \n\t"	/* if (C <= 128) execute more efficient code */
		"ja                  2f \n\t"	/* unsigned compare: C > 128 takes the overflow-safe loop */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4  */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of SrcDest into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of SrcDest into words */
		"pmullw    %%mm1, %%mm3 \n\t"	/* mul low  bytes of SrcDest and MM1 */
		"pmullw    %%mm1, %%mm4 \n\t"	/* mul high bytes of SrcDest and MM1 */
		"packuswb  %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"jmp                 3f \n\t"
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"2: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4  */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of SrcDest into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of SrcDest into words */
		"pmullw    %%mm1, %%mm3 \n\t"	/* mul low  bytes of SrcDest and MM1 */
		"pmullw    %%mm1, %%mm4 \n\t"	/* mul high bytes of SrcDest and MM1 */
		/* ** Take abs value of the results (signed words) ** */
		"movq      %%mm3, %%mm5 \n\t"	/* copy mm3 into mm5 */
		"movq      %%mm4, %%mm6 \n\t"	/* copy mm4 into mm6 */
		"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
		"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
		"pxor      %%mm5, %%mm3 \n\t"	/* take 1's complement of only neg. words */
		"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
		"psubsw    %%mm5, %%mm3 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"packuswb  %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 2b \n\t"	/* check loop termination, proceed if required */
		"3: emms               \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(C)			/* %3 */
		:"memory"		/* the loops read *Src1 and write *Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
2866
/*!
\brief Filter using MultByByte: D = saturation255(S * C)

Every source byte is multiplied by C and clamped to 255. When MMX is
detected and at least 8 bytes are supplied, the whole 8-byte groups are
handled by SDL_imageFilterMultByByteMMX() and only the remaining
(length & 7) bytes go through the C loop; otherwise the C loop processes
the complete array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: C==1 is a plain copy S -> D */
	if (C == 1) {
		/* Destination comes first; memmove keeps in-place calls (Src1 == Dest) well defined */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		/* Largest product is 255 * 255, which fits an int comfortably */
		result = (int) *cursrc1 * iC;
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
2932
/*!
\brief Internal MMX Filter using ShiftRightAndMultByByteMMX: D = saturation255((S >> N) * C) 

Processes SrcLength/8 groups of 8 bytes. The bytes are unpacked to words,
shifted right by N, multiplied by C and packed back with packuswb.
packuswb interprets the words as signed, so the absolute value of each
product word is taken before packing; this makes products with the word
sign bit set (possible when N is 0 and C is above 128) saturate to 255.

Bytes beyond the last full group (SrcLength & 7) are not touched. The loop
counter is decremented before it is tested, so SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (built without USE_MMX).
*/
int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
											  unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 4 words of MM1 ** */
			mov al, C   	/* load C into AL */
			xor ah, ah   	/* zero AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
			xor ecx, ecx   	/* zero ECX */
			mov cl, N   	/* load N into CL */
			movd mm7, ecx   	/* copy N into MM7 */
			pxor mm0, mm0   	/* zero MM0 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1026:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
			psrlw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the right */
			psrlw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the right */
			pmullw mm3, mm1   	/* mul low  bytes of SrcDest by MM1 */
			pmullw mm4, mm1   	/* mul high bytes of SrcDest by MM1 */
			/* ** Take abs value of the results (signed words) ** */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1026    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* ** Duplicate C in 4 words of MM1 ** */
		"mov           %4, %%al \n\t"	/* load C into AL */
		"xor         %%ah, %%ah \n\t"	/* zero AH */
		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher words of MM1 with C */
		"xor       %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov           %3, %%cl \n\t"	/* load N into CL */
		"movd      %%ecx, %%mm7 \n\t"	/* copy N into MM7 */
		"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 register */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16             \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4  */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of SrcDest into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of SrcDest into words */
		"psrlw     %%mm7, %%mm3 \n\t"	/* shift 4 WORDS of MM3 (N) bits to the right */
		"psrlw     %%mm7, %%mm4 \n\t"	/* shift 4 WORDS of MM4 (N) bits to the right */
		"pmullw    %%mm1, %%mm3 \n\t"	/* mul low  bytes of SrcDest by MM1 */
		"pmullw    %%mm1, %%mm4 \n\t"	/* mul high bytes of SrcDest by MM1 */
		/* ** Take abs value of the results (signed words) ** */
		"movq      %%mm3, %%mm5 \n\t"	/* copy mm3 into mm5 */
		"movq      %%mm4, %%mm6 \n\t"	/* copy mm4 into mm6 */
		"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
		"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
		"pxor      %%mm5, %%mm3 \n\t"	/* take 1's complement of only neg. words */
		"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
		"psubsw    %%mm5, %%mm3 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"packuswb  %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N),			/* %3 */
		"m"(C)			/* %4 */
		:"memory"		/* the loop reads *Src1 and writes *Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
3036
/*!
\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C) 

Every source byte is shifted right by N bits, multiplied by C and clamped
to 255. When MMX is detected and at least 8 bytes are supplied, the whole
8-byte groups are handled by SDL_imageFilterShiftRightAndMultByByteMMX()
and only the remaining (length & 7) bytes go through the C loop; otherwise
the C loop processes the complete array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
										   unsigned char C)
{
	unsigned int i, istart;
	int iC;
	unsigned char *cursrc1;
	unsigned char *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Check shift */
	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 && C==1 is a plain copy S -> D */
	if ((N == 0) && (C == 1)) {
		/* Destination comes first; memmove keeps in-place calls (Src1 == Dest) well defined */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		/* The byte is promoted to int before shifting, so N == 8 yields 0 */
		result = (int) (*cursrc1 >> N) * iC;
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
3109
/*!
\brief Internal MMX Filter using ShiftLeftByte: D = (S << N)

Processes SrcLength/8 groups of 8 bytes. MMX has no per-byte shift, so the
data is shifted as 4 words (psllw) and the bits that crossed from the low
byte into the high byte of each word are removed with a per-byte bit mask.
That mask is built in MM1 by shifting an all-ones register left one bit at a
time, N times, and and-ing it with Mask after every step. When N is 0 the
mask-building loop is skipped so MM1 stays all ones and the data is copied
unchanged (entering the loop with a zero counter would wrap the counter and
clear the mask).

Bytes beyond the last full group (SrcLength & 7) are not touched. The loop
counter is decremented before it is tested, so SrcLength must be at least 8.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param Mask Byte array containing 8 bytes of 0xFE value.

\return Returns 0 for success or -1 for error (built without USE_MMX).
*/
int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
									unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			mov edx, Mask   	/* load Mask address into edx */
			movq mm0, [edx]   	/* load Mask into mm0 */
		xor ecx, ecx   	/* zero ECX */
			mov cl, N   	/* load loop counter (N) into CL */
			movd mm3, ecx   	/* copy (N) into MM3  */
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
			test cl, cl   	/* N == 0: keep the all 1's mask */
			jz             L10272    	/* skip the mask preparation loop */
L10270:                  	/* ** Prepare proper bit-Mask in MM1 ** */
		psllw mm1, 1   	/* shift 4 WORDS of MM1 1 bit to the left */
			pand mm1, mm0   	/* apply Mask to 8 BYTES of MM1 */
			/*  byte     0x0f, 0xdb, 0xc8 */
			dec cl                  	/* decrease loop counter */
			jnz            L10270    	/* check loop termination, proceed if required */
L10272:
			/* ** Shift all bytes of the image ** */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load SrcDest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L10271:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psllw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the left */
			pand mm0, mm1   	/* apply proper bit-Mask to 8 BYTES of MM0 */
			/* byte     0x0f, 0xdb, 0xc1 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10271    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		"movl         %4, %%edx \n\t"	/* load Mask address into edx */
		"movq    (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
		"xor       %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov           %3, %%cl \n\t"	/* load loop counter (N) into CL */
		"movd      %%ecx, %%mm3 \n\t"	/* copy (N) into MM3  */
		"pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 */
		"test        %%cl, %%cl \n\t"	/* N == 0: keep the all 1's mask */
		"jz                  3f \n\t"	/* skip the mask preparation loop */
		"1:                     \n\t"	/* ** Prepare proper bit-Mask in MM1 ** */
		"psllw        $1, %%mm1 \n\t"	/* shift 4 WORDS of MM1 1 bit to the left */
		/*    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1 */
		".byte     0x0f, 0xdb, 0xc8 \n\t"	/* hand-encoded form of the pand shown above */
		"dec               %%cl \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"3:                     \n\t"
		/* ** Shift all bytes of the image ** */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load SrcDest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"2: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"psllw     %%mm3, %%mm0 \n\t"	/* shift 4 WORDS of MM0 (N) bits to the left */
		/*    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0 */
		".byte     0x0f, 0xdb, 0xc1 \n\t"	/* hand-encoded form of the pand shown above */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 2b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N),			/* %3 */
		"m"(Mask)			/* %4 */
		:"memory"		/* the loop reads *Src1 / *Mask and writes *Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
3200
/*!
\brief Filter using ShiftLeftByte: D = (S << N)

Every source byte is shifted left by N bits; bits shifted out of the byte
are discarded. When MMX is detected and at least 8 bytes are supplied, the
whole 8-byte groups are handled by SDL_imageFilterShiftLeftByteMMX() and
only the remaining (length & 7) bytes go through the C loop; otherwise the
C loop processes the complete array.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
*/
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	/* Per-byte mask handed to the MMX routine (clears the lowest bit of every byte) */
	static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy S -> D */
	if (N == 0) {
		/* Destination comes first; memmove keeps in-place calls (Src1 == Dest) well defined */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		/* Shift in int width, then keep only the low byte */
		result = ((int) *cursrc1 << N) & 0xff;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
3266
/*!
\brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)

Processes SrcLength/8 groups of 8 bytes; each group is shifted as two
32-bit values (pslld). Bytes beyond the last full group (SrcLength & 7) are
not touched. The loop counter is decremented before it is tested, so
SrcLength must be at least 8.

The shift count is first loaded, zero-extended, into MM1: the memory form of
pslld reads a 64-bit count, so it must not be pointed at the single byte N.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error (built without USE_MMX).
*/
int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			xor ecx, ecx   	/* zero ECX */
			mov cl, N   	/* load shift count (N) into CL */
			movd mm1, ecx   	/* copy zero-extended (N) into MM1 */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L12023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		pslld mm0, mm1   	/* shift 2 DWORDS of MM0 (N) bits to the left */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L12023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		"xor       %%ecx, %%ecx \n\t"	/* zero ECX */
		"mov           %3, %%cl \n\t"	/* load shift count (N) into CL */
		"movd      %%ecx, %%mm1 \n\t"	/* copy zero-extended (N) into MM1 */
		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
		"pslld     %%mm1, %%mm0 \n\t"	/* shift 2 DWORDS of MM0 (N) bits to the left */
		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"emms                   \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N)			/* %3 */
		:"memory"		/* the loop reads *Src1 and writes *Dest */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
3327
/*!
\brief Filter using ShiftLeftUint: D = ((uint)S << N)

The buffers are processed as native-endian unsigned int words and every
complete word is shifted left by N bits. Trailing bytes that do not form a
complete word are not written, except for N==0 where all bytes are copied.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	const unsigned int wordSize = (unsigned int) sizeof(unsigned int);
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	unsigned int word;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 32) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination. */
	/* memmove keeps in-place calls (Src1 == Dest) well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image: one complete word per iteration. */
	/* (length - i) cannot underflow because i never exceeds length. */
	for (i = istart; (length - i) >= wordSize; i += wordSize) {
		/* Load/store through memcpy: the byte buffers carry no alignment guarantee */
		memcpy(&word, cursrc1, wordSize);
		/* Shifting by the full word width is undefined in C; the result is 0 */
		word = (N < wordSize * 8) ? (word << N) : 0;
		memcpy(curdest, &word, wordSize);
		/* Advance pointers */
		cursrc1 += wordSize;
		curdest += wordSize;
	}

	return (0);
}
3397
/*!
\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)

Each source byte is widened to a 16-bit word, shifted left by N bits and
packed back to a byte with unsigned saturation. Eight bytes are handled per
loop iteration; only the SrcLength/8 complete 8-byte groups are processed and
remaining bytes are left to the caller.

The loops test their counter only after the first pass (dec/jnz), so the
caller must guarantee SrcLength >= 8; a zero counter would wrap around.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array. Must be at least 8.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (-1 when built without USE_MMX).
*/
int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			/* N is kept in EDX/DL: EAX is reloaded with the Src1 address */
			/* before the N <= 7 test below is made. */
			xor edx, edx   	/* zero EDX */
			mov dl, N   	/* load N into DL */
			movd mm7, edx   	/* copy N into MM7 */
			pxor mm0, mm0   	/* zero MM0 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			cmp dl, 7   	/* if (N <= 7) execute more efficient code */
			jg             L10281
			align 16                 	/* 16 byte alignment of the loop entry */
L10280:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10280    	/* check loop termination, proceed if required */
			jmp            L10282
			align 16                 	/* 16 byte alignment of the loop entry */
L10281:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
			/* ** Take abs value of the signed words ** */
			/* (a word with bit 15 set would be packed to 0 by packuswb) */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10281    	/* check loop termination, proceed if required */
L10282:
		emms                      	/* exit MMX state */
			popa
	}
#else
	asm volatile
		("pusha		     \n\t"
		/* N is kept in EDX/DL: EAX is reloaded with the Src1 address */
		/* before the N <= 7 test below is made. */
		"xor       %%edx, %%edx \n\t"	/* zero EDX */
		"mov           %3, %%dl \n\t"	/* load N into DL */
		"movd      %%edx, %%mm7 \n\t"	/* copy N into MM7 */
		"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 register */
		"mov         %1, %%eax  \n\t"	/* load Src1 address into eax */
		"mov         %0, %%edi  \n\t"	/* load Dest address into edi */
		"mov         %2, %%ecx  \n\t"	/* load loop counter (SIZE) into ecx */
		"shr         $3, %%ecx  \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		"cmp           $7, %%dl \n\t"	/* if (N <= 7) execute more efficient code */
		"jg                  2f \n\t" ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4  */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src1 into words */
		"psllw     %%mm7, %%mm3 \n\t"	/* shift 4 WORDS of MM3 (N) bits to the left */
		"psllw     %%mm7, %%mm4 \n\t"	/* shift 4 WORDS of MM4 (N) bits to the left */
		"packuswb  %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
		"jmp                 3f \n\t" ".align 16              \n\t"	/* 16 byte alignment of the loop entry */
		"2: movq (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4  */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src1 into words */
		"psllw     %%mm7, %%mm3 \n\t"	/* shift 4 WORDS of MM3 (N) bits to the left */
		"psllw     %%mm7, %%mm4 \n\t"	/* shift 4 WORDS of MM4 (N) bits to the left */
		/* ** Take abs value of the signed words ** */
		/* (a word with bit 15 set would be packed to 0 by packuswb) */
		"movq      %%mm3, %%mm5 \n\t"	/* copy mm3 into mm5 */
		"movq      %%mm4, %%mm6 \n\t"	/* copy mm4 into mm6 */
		"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
		"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
		"pxor      %%mm5, %%mm3 \n\t"	/* take 1's complement of only neg. words */
		"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
		"psubsw    %%mm5, %%mm3 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
		"packuswb  %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm3, (%%edi) \n\t"	/* store result in Dest */
		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
		"dec              %%ecx \n\t"	/* decrease loop counter */
		"jnz                 2b \n\t"	/* check loop termination, proceed if required */
		"3: emms                \n\t"	/* exit MMX state */
		"popa                   \n\t":"=m" (Dest)	/* %0 */
		:"m"(Src1),		/* %1 */
		"m"(SrcLength),		/* %2 */
		"m"(N)			/* %3 */
		);
#endif
	return (0);
#else
	return (-1);
#endif
}
3525
/*!
\brief Filter ShiftLeft: D = saturation255(S << N)

Every source byte is shifted left by N bits and clamped to 255. When the MMX
routine is available it handles the complete 8-byte groups and the C loop
finishes the remaining bytes.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	unsigned char *cursrc1, *curdest;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination. */
	/* memmove keeps in-place calls (Src1 == Dest) well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		/* Widen to int first so the shifted value can exceed 255 before clamping */
		result = (int) *cursrc1 << N;
		if (result > 255)
			result = 255;
		*curdest = (unsigned char) result;
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
3592
3593/*!
3594\brief MMX BinarizeUsingThreshold: D = (S >= T) ? 255:0
3595
3596\param Src1 Pointer to the start of the source byte array (S).
3597\param Dest Pointer to the start of the destination byte array (D).
3598\param SrcLength The number of bytes in the source array. Only the SrcLength/8 complete 8-byte groups are processed; the loop tests its counter after the first pass, so SrcLength must be at least 8.
3599\param T The threshold boundary (inclusive).
3600
3601\return Returns 0 for success or -1 for error (-1 is returned when built without USE_MMX).
3602*/
3603int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
3604{
3605#ifdef USE_MMX
3606#if !defined(GCC__)
3607	__asm
3608	{
3609		pusha
3610			/* ** Duplicate T in 8 bytes of MM3 ** */
3611			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 (0xFF in every byte) */
3612			pcmpeqb mm2, mm2   	/* generate all 1's in mm2 (0xFF in every byte) */
3613			mov al, T   	/* load T into AL */
3614			mov ah, al   	/* copy AL into AH */
3615			mov bx, ax   	/* copy AX into BX */
3616			shl eax, 16   	/* shift 2 bytes of EAX left */
3617			mov ax, bx   	/* copy BX into AX, EAX now holds T in all 4 bytes */
3618			movd mm3, eax   	/* copy EAX into MM3 */
3619			movd mm4, eax   	/* copy EAX into MM4 */
3620			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with T */
3621			psubusb mm2, mm3   	/* store 0xFF - T in MM2 */
3622			mov eax, Src1   	/* load Src1 address into eax */
3623			mov edi, Dest   	/* load Dest address into edi */
3624			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
3625			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
3626			align 16                 	/* 16 byte alignment of the loop entry */
3627L1029:
3628		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
3629		paddusb mm0, mm2   	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation): bytes >= T become 0xFF */
3630			pcmpeqb mm0, mm1   	/* binarize 255:0, comparing to 255 */
3631			movq [edi], mm0   	/* store result in Dest */
3632			add eax, 8   	/* increase Src1 register pointer by 8 */
3633			add edi, 8   	/* increase Dest register pointer by 8 */
3634			dec              ecx    	/* decrease loop counter */
3635			jnz             L1029    	/* check loop termination, proceed if required */
3636			emms                      	/* exit MMX state */
3637			popa
3638	}
3639#else
3640	asm volatile	/* NOTE(review): Dest is declared as an "=m" output although the asm only reads the pointer, and no "memory" clobber is listed - confirm the compiler cannot cache the destination bytes */
3641		("pusha		     \n\t"
3642		/* ** Duplicate T in 8 bytes of MM3 ** */
3643		"pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 (0xFF in every byte) */
3644		"pcmpeqb   %%mm2, %%mm2 \n\t"	/* generate all 1's in mm2 (0xFF in every byte) */
3645		"mov           %3, %%al \n\t"	/* load T into AL */
3646		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
3647		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
3648		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
3649		"mov         %%bx, %%ax \n\t"	/* copy BX into AX, EAX now holds T in all 4 bytes */
3650		"movd      %%eax, %%mm3 \n\t"	/* copy EAX into MM3 */
3651		"movd      %%eax, %%mm4 \n\t"	/* copy EAX into MM4 */
3652		"punpckldq %%mm4, %%mm3 \n\t"	/* fill higher bytes of MM3 with T */
3653		"psubusb   %%mm3, %%mm2 \n\t"	/* store 0xFF - T in MM2 */
3654		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
3655		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
3656		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
3657		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
3658		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
3659		"1:                     \n\t" 
3660		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
3661		"paddusb   %%mm2, %%mm0 \n\t"	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation): bytes >= T become 0xFF */
3662		"pcmpeqb   %%mm1, %%mm0 \n\t"	/* binarize 255:0, comparing to 255 */
3663		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
3664		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
3665		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
3666		"dec              %%ecx \n\t"	/* decrease loop counter */
3667		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
3668		"emms                   \n\t"	/* exit MMX state */
3669		"popa                   \n\t":"=m" (Dest)	/* %0 */
3670		:"m"(Src1),		/* %1 */
3671		"m"(SrcLength),		/* %2 */
3672		"m"(T)			/* %3 */
3673		);
3674#endif
3675	return (0);
3676#else
3677	return (-1);
3678#endif
3679}
3680
/*!
\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0

Writes 255 for every source byte that is at least T and 0 for every other
byte. With T==0 the whole destination is filled with 255. When MMX is
detected and at least 8 bytes are given, the MMX routine is called first and
the C loop only handles the bytes after the last complete 8-byte group.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
	unsigned int pos;
	unsigned int first = 0;	/* index of the first byte left for the C loop */

	/* Reject missing buffers; an empty buffer is trivially done */
	if (Src1 == NULL || Dest == NULL) {
		return -1;
	}
	if (length == 0) {
		return 0;
	}

	/* Every byte is >= 0, so a zero threshold yields an all-255 result */
	if (T == 0) {
		memset(Dest, 255, length);
		return 0;
	}

	if (SDL_imageFilterMMXdetect() && length > 7) {
		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);

		/* Bytes past the last complete 8-byte group still need the C loop */
		first = length & 0xfffffff8;
		if (first == length) {
			return 0;
		}
	}

	/* Plain C fallback / tail processing */
	for (pos = first; pos < length; pos++) {
		if (Src1[pos] >= T) {
			Dest[pos] = 255;
		} else {
			Dest[pos] = 0;
		}
	}

	return 0;
}
3740
3741/*!
3742\brief Internal MMX Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax
3743
3744\param Src1 Pointer to the start of the source byte array (S).
3745\param Dest Pointer to the start of the destination byte array (D).
3746\param SrcLength The number of bytes in the source array. Only the SrcLength/8 complete 8-byte groups are processed; the loop tests its counter after the first pass, so SrcLength must be at least 8.
3747\param Tmin Lower (inclusive) boundary of the clipping range.
3748\param Tmax Upper (inclusive) boundary of the clipping range.
3749
3750\return Returns 0 for success or -1 for error (-1 is returned when built without USE_MMX).
3751*/
3752int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
3753								  unsigned char Tmax)
3754{
3755#ifdef USE_MMX
3756#if !defined(GCC__)
3757	__asm
3758	{
3759		pusha
3760			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 (0xFF in every byte) */
3761			/* ** Duplicate Tmax in 8 bytes of MM3 ** */
3762			mov al, Tmax   	/* load Tmax into AL */
3763			mov ah, al   	/* copy AL into AH */
3764			mov bx, ax   	/* copy AX into BX */
3765			shl eax, 16   	/* shift 2 bytes of EAX left */
3766			mov ax, bx   	/* copy BX into AX, EAX now holds Tmax in all 4 bytes */
3767			movd mm3, eax   	/* copy EAX into MM3 */
3768			movd mm4, eax   	/* copy EAX into MM4 */
3769			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with Tmax */
3770			psubusb mm1, mm3   	/* store 0xFF - Tmax in MM1 */
3771			/* ** Duplicate Tmin in 8 bytes of MM5 ** */
3772			mov al, Tmin   	/* load Tmin into AL */
3773			mov ah, al   	/* copy AL into AH */
3774			mov bx, ax   	/* copy AX into BX */
3775			shl eax, 16   	/* shift 2 bytes of EAX left */
3776			mov ax, bx   	/* copy BX into AX, EAX now holds Tmin in all 4 bytes */
3777			movd mm5, eax   	/* copy EAX into MM5 */
3778			movd mm4, eax   	/* copy EAX into MM4 */
3779			punpckldq mm5, mm4   	/* fill higher bytes of MM5 with Tmin */
3780			movq mm7, mm5   	/* copy MM5 into MM7 */
3781			paddusb mm7, mm1   	/* store 0xFF - Tmax + Tmin in MM7 (saturating add) */
3782			mov eax, Src1   	/* load Src1 address into eax */
3783			mov edi, Dest   	/* load Dest address into edi */
3784			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
3785			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
3786			align 16                 	/* 16 byte alignment of the loop entry */
3787L1030:
3788		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
3789		paddusb mm0, mm1   	/* MM0=Src1+(0xFF-Tmax), saturation caps bytes above Tmax */
3790			psubusb mm0, mm7   	/* MM0=MM0-(0xFF-Tmax+Tmin), saturation floors bytes below Tmin at 0 */
3791			paddusb mm0, mm5   	/* MM0=MM0+Tmin */
3792			movq [edi], mm0   	/* store result in Dest */
3793			add eax, 8   	/* increase Src1 register pointer by 8 */
3794			add edi, 8   	/* increase Dest register pointer by 8 */
3795			dec              ecx    	/* decrease loop counter */
3796			jnz             L1030    	/* check loop termination, proceed if required */
3797			emms                      	/* exit MMX state */
3798			popa
3799	}
3800#else
3801	asm volatile	/* NOTE(review): Dest is declared as an "=m" output although the asm only reads the pointer, and no "memory" clobber is listed - confirm the compiler cannot cache the destination bytes */
3802		("pusha		     \n\t" "pcmpeqb   %%mm1, %%mm1 \n\t"	/* generate all 1's in mm1 (0xFF in every byte) */
3803		/* ** Duplicate Tmax in 8 bytes of MM3 ** */
3804		"mov           %4, %%al \n\t"	/* load Tmax into AL */
3805		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
3806		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
3807		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
3808		"mov         %%bx, %%ax \n\t"	/* copy BX into AX, EAX now holds Tmax in all 4 bytes */
3809		"movd      %%eax, %%mm3 \n\t"	/* copy EAX into MM3 */
3810		"movd      %%eax, %%mm4 \n\t"	/* copy EAX into MM4 */
3811		"punpckldq %%mm4, %%mm3 \n\t"	/* fill higher bytes of MM3 with Tmax */
3812		"psubusb   %%mm3, %%mm1 \n\t"	/* store 0xFF - Tmax in MM1 */
3813		/* ** Duplicate Tmin in 8 bytes of MM5 ** */
3814		"mov           %3, %%al \n\t"	/* load Tmin into AL */
3815		"mov         %%al, %%ah \n\t"	/* copy AL into AH */
3816		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
3817		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
3818		"mov         %%bx, %%ax \n\t"	/* copy BX into AX, EAX now holds Tmin in all 4 bytes */
3819		"movd      %%eax, %%mm5 \n\t"	/* copy EAX into MM5 */
3820		"movd      %%eax, %%mm4 \n\t"	/* copy EAX into MM4 */
3821		"punpckldq %%mm4, %%mm5 \n\t"	/* fill higher bytes of MM5 with Tmin */
3822		"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
3823		"paddusb   %%mm1, %%mm7 \n\t"	/* store 0xFF - Tmax + Tmin in MM7 (saturating add) */
3824		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
3825		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
3826		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
3827		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
3828		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
3829		"1:                     \n\t" 
3830		"movq    (%%eax), %%mm0 \n\t"	/* load 8 bytes from Src1 into MM0 */
3831		"paddusb   %%mm1, %%mm0 \n\t"	/* MM0=Src1+(0xFF-Tmax), saturation caps bytes above Tmax */
3832		"psubusb   %%mm7, %%mm0 \n\t"	/* MM0=MM0-(0xFF-Tmax+Tmin), saturation floors bytes below Tmin at 0 */
3833		"paddusb   %%mm5, %%mm0 \n\t"	/* MM0=MM0+Tmin */
3834		"movq    %%mm0, (%%edi) \n\t"	/* store result in Dest */
3835		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
3836		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
3837		"dec              %%ecx \n\t"	/* decrease loop counter */
3838		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
3839		"emms                   \n\t"	/* exit MMX state */
3840		"popa                   \n\t":"=m" (Dest)	/* %0 */
3841		:"m"(Src1),		/* %1 */
3842		"m"(SrcLength),		/* %2 */
3843		"m"(Tmin),		/* %3 */
3844		"m"(Tmax)			/* %4 */
3845		);
3846#endif
3847	return (0);
3848#else
3849	return (-1);
3850#endif
3851}
3852
/*!
\brief Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Source bytes below Tmin are replaced by Tmin, bytes above Tmax by Tmax, and
all other bytes are copied unchanged. When MMX is detected and at least
8 bytes are given, the MMX routine handles the complete 8-byte groups and the
C loop finishes the remaining bytes.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
							   unsigned char Tmax)
{
	unsigned int i, istart;
	unsigned char *cursrc1;
	unsigned char *curdest;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: Tmin==0 && Tmax==255 covers every byte value, so the */
	/* filter is a plain copy from source to destination. */
	/* memmove keeps in-place calls (Src1 == Dest) well defined. */
	if ((Tmin == 0) && (Tmax == 255)) {
		memmove(Dest, Src1, length);
		return (0); 
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {

		SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);

		/* Check for unaligned bytes */
		if ((length & 7) > 0) {
			/* Setup to process unaligned bytes */
			istart = length & 0xfffffff8;
			cursrc1 = &Src1[istart];
			curdest = &Dest[istart];
		} else {
			/* No unaligned bytes - we are done */
			return (0);
		}
	} else {
		/* Setup to process whole image */
		istart = 0;
		cursrc1 = Src1;
		curdest = Dest;
	}

	/* C routine to process image */
	for (i = istart; i < length; i++) {
		if (*cursrc1 < Tmin) {
			*curdest = Tmin;
		} else if (*cursrc1 > Tmax) {
			*curdest = Tmax;
		} else {
			*curdest = *cursrc1;
		}
		/* Advance pointers */
		cursrc1++;
		curdest++;
	}

	return (0);
}
3920
3921/*!
3922\brief Internal MMX Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
3923
3924\param Src1 Pointer to the start of the source byte array (S).
3925\param Dest Pointer to the start of the destination byte array (D).
3926\param SrcLength The number of bytes in the source array.
3927\param Cmin Normalization constant (Cmin).
3928\param Cmax Normalization constant (Cmax).
3929\param Nmin Normalization constant (Nmin).
3930\param Nmax Normalization constant (Nmax).
3931
3932\return Returns 0 for success or -1 for error.
3933*/
3934int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
3935									  int Nmin, int Nmax)
3936{
3937#ifdef USE_MMX
3938#if !defined(GCC__)
3939	__asm
3940	{
3941		pusha
3942			mov ax, WORD PTR Nmax   	/* load Nmax in AX */
3943			mov bx, WORD PTR Cmax   	/* load Cmax in BX */
3944			sub ax, WORD PTR Nmin   	/* AX = Nmax - Nmin */
3945			sub bx, WORD PTR Cmin   	/* BX = Cmax - Cmin */
3946			jz             L10311    	/* check division by zero */
3947			xor dx, dx   	/* prepare for division, zero DX */
3948			div               bx    	/* AX = AX/BX */
3949			jmp            L10312
3950L10311:
3951		mov ax, 255   	/* if div by zero, assume result max byte value */
3952L10312:                  	/* ** Duplicate AX in 4 words of MM0 ** */
3953		mov bx, ax   	/* copy AX into BX */
3954			shl eax, 16   	/* shift 2 bytes of EAX left */
3955			mov ax, bx   	/* copy BX into AX */
3956			movd mm0, eax   	/* copy EAX into MM0 */
3957			movd mm1, eax   	/* copy EAX into MM1 */
3958			punpckldq mm0, mm1   	/* fill higher words of MM0 with AX */
3959			/* ** Duplicate Cmin in 4 words of MM1 ** */
3960			mov ax, WORD PTR Cmin   	/* load Cmin into AX */
3961			mov bx, ax   	/* copy AX into BX */
3962			shl eax, 16   	/* shift 2 bytes of EAX left */
3963			mov ax, bx   	/* copy BX into AX */
3964			movd mm1, eax   	/* copy EAX into MM1 */
3965			movd mm2, eax   	/* copy EAX into MM2 */
3966			punpckldq mm1, mm2   	/* fill higher words of MM1 with Cmin */
3967			/* ** Duplicate Nmin in 4 words of MM2 ** */
3968			mov ax, WORD PTR Nmin   	/* load Nmin into AX */
3969			mov bx, ax   	/* copy AX into BX */
3970			shl eax, 16   	/* shift 2 bytes of EAX left */
3971			mov ax, bx   	/* copy BX into AX */
3972			movd mm2, eax   	/* copy EAX into MM2 */
3973			movd mm3, eax   	/* copy EAX into MM3 */
3974			punpckldq mm2, mm3   	/* fill higher words of MM2 with Nmin */
3975			pxor mm7, mm7   	/* zero MM7 register */
3976			mov eax, Src1   	/* load Src1 address into eax */
3977			mov edi, Dest   	/* load Dest address into edi */
3978			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
3979			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
3980			align 16                 	/* 16 byte alignment of the loop entry */
3981L1031:
3982		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
3983		movq mm4, mm3   	/* copy MM3 into MM4  */
3984			punpcklbw mm3, mm7   	/* unpack low  bytes of SrcDest into words */
3985			punpckhbw mm4, mm7   	/* unpack high bytes of SrcDest into words */
3986			psubusb mm3, mm1   	/* S-Cmin, low  bytes */
3987			psubusb mm4, mm1   	/* S-Cmin, high bytes */
3988			pmullw mm3, mm0   	/* MM0*(S-Cmin), low  bytes */
3989			pmullw mm4, mm0   	/* MM0*(S-Cmin), high bytes */
3990			paddusb mm3, mm2   	/* MM0*(S-Cmin)+Nmin, low  bytes */
3991			paddusb mm4, mm2   	/* MM0*(S-Cmin)+Nmin, high bytes */
3992			/* ** Take abs value of the signed words ** */
3993			movq mm5, mm3   	/* copy mm3 into mm5 */
3994			movq mm6, mm4   	/* copy mm4 into mm6 */
3995			psraw mm5, 15   	/* fill mm5 words with word sign bit */
3996			psraw mm6, 15   	/* fill mm6 words with word sign bit */
3997			pxor mm3, mm5   	/* take 1's compliment of only neg words */
3998			pxor mm4, mm6   	/* take 1's compliment of only neg words */
3999			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
4000			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
4001			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
4002			movq [edi], mm3   	/* store result in Dest */
4003			add eax, 8   	/* increase Src1 register pointer by 8 */
4004			add edi, 8   	/* increase Dest register pointer by 8 */
4005			dec              ecx    	/* decrease loop counter */
4006			jnz             L1031    	/* check loop termination, proceed if required */
4007			emms                      	/* exit MMX state */
4008			popa
4009	}
4010#else
4011	asm volatile
4012		("pusha		     \n\t" "mov           %6, %%ax \n\t"	/* load Nmax in AX */
4013		"mov           %4, %%bx \n\t"	/* load Cmax in BX */
4014		"sub           %5, %%ax \n\t"	/* AX = Nmax - Nmin */
4015		"sub           %3, %%bx \n\t"	/* BX = Cmax - Cmin */
4016		"jz                  1f \n\t"	/* check division by zero */
4017		"xor         %%dx, %%dx \n\t"	/* prepare for division, zero DX */
4018		"div               %%bx \n\t"	/* AX = AX/BX */
4019		"jmp                 2f \n\t" "1:                     \n\t" "mov         $255, %%ax \n\t"	/* if div by zero, assume result max. byte value */
4020		"2:                    \n\t"	/* ** Duplicate AX in 4 words of MM0 ** */
4021		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
4022		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
4023		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
4024		"movd      %%eax, %%mm0 \n\t"	/* copy EAX into MM0 */
4025		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
4026		"punpckldq %%mm1, %%mm0 \n\t"	/* fill higher words of MM0 with AX */
4027		/* ** Duplicate Cmin in 4 words of MM1 ** */
4028		"mov           %3, %%ax \n\t"	/* load Cmin into AX */
4029		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
4030		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
4031		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
4032		"movd      %%eax, %%mm1 \n\t"	/* copy EAX into MM1 */
4033		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
4034		"punpckldq %%mm2, %%mm1 \n\t"	/* fill higher words of MM1 with Cmin */
4035		/* ** Duplicate Nmin in 4 words of MM2 ** */
4036		"mov           %5, %%ax \n\t"	/* load Nmin into AX */
4037		"mov         %%ax, %%bx \n\t"	/* copy AX into BX */
4038		"shl         $16, %%eax \n\t"	/* shift 2 bytes of EAX left */
4039		"mov         %%bx, %%ax \n\t"	/* copy BX into AX */
4040		"movd      %%eax, %%mm2 \n\t"	/* copy EAX into MM2 */
4041		"movd      %%eax, %%mm3 \n\t"	/* copy EAX into MM3 */
4042		"punpckldq %%mm3, %%mm2 \n\t"	/* fill higher words of MM2 with Nmin */
4043		"pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 register */
4044		"mov          %1, %%eax \n\t"	/* load Src1 address into eax */
4045		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
4046		"mov          %2, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
4047		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
4048		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
4049		"1:                     \n\t" 
4050		"movq    (%%eax), %%mm3 \n\t"	/* load 8 bytes from Src1 into MM3 */
4051		"movq      %%mm3, %%mm4 \n\t"	/* copy MM3 into MM4  */
4052		"punpcklbw %%mm7, %%mm3 \n\t"	/* unpack low  bytes of SrcDest into words */
4053		"punpckhbw %%mm7, %%mm4 \n\t"	/* unpack high bytes of SrcDest into words */
4054		"psubusb   %%mm1, %%mm3 \n\t"	/* S-Cmin, low  bytes */
4055		"psubusb   %%mm1, %%mm4 \n\t"	/* S-Cmin, high bytes */
4056		"pmullw    %%mm0, %%mm3 \n\t"	/* MM0*(S-Cmin), low  bytes */
4057		"pmullw    %%mm0, %%mm4 \n\t"	/* MM0*(S-Cmin), high bytes */
4058		"paddusb   %%mm2, %%mm3 \n\t"	/* MM0*(S-Cmin)+Nmin, low  bytes */
4059		"paddusb   %%mm2, %%mm4 \n\t"	/* MM0*(S-Cmin)+Nmin, high bytes */
4060		/* ** Take abs value of the signed words ** */
4061		"movq      %%mm3, %%mm5 \n\t"	/* copy mm3 into mm5 */
4062		"movq      %%mm4, %%mm6 \n\t"	/* copy mm4 into mm6 */
4063		"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
4064		"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
4065		"pxor      %%mm5, %%mm3 \n\t"	/* take 1's compliment of only neg. words */
4066		"pxor      %%mm6, %%mm4 \n\t"	/* take 1's compliment of only neg. words */
4067		"psubsw    %%mm5, %%mm3 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
4068		"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
4069		"packuswb  %%mm4, %%mm3 \n\t"	/* pack words back into bytes with saturation */
4070		"movq    %%mm3, (%%edi) \n\t"	/* store result in Dest */
4071		"add          $8, %%eax \n\t"	/* increase Src1 register pointer by 8 */
4072		"add          $8, %%edi \n\t"	/* increase Dest register pointer by 8 */
4073		"dec              %%ecx \n\t"	/* decrease loop counter */
4074		"jnz                 1b \n\t"	/* check loop termination, proceed if required */
4075		"emms                   \n\t"	/* exit MMX state */
4076		"popa                   \n\t":"=m" (Dest)	/* %0 */
4077		:"m"(Src1),		/* %1 */
4078		"m"(SrcLength),		/* %2 */
4079		"m"(Cmin),		/* %3 */
4080		"m"(Cmax),		/* %4 */
4081		"m"(Nmin),		/* %5 */
4082		"m"(Nmax)			/* %6 */
4083		);
4084#endif
4085	return (0);
4086#else
4087	return (-1);
4088#endif
4089}
4090
/*!
\brief Filter using NormalizeLinear: D = saturation0and255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)

The scale factor (Nmax - Nmin)/(Cmax - Cmin) is computed once with integer
division, so it is truncated towards zero before being applied to each byte.
If Cmax equals Cmin the factor 255 is used, which is the same substitute the
MMX routine uses for a zero divisor; this keeps the C path and the MMX path
consistent instead of leaving Dest unwritten.

When MMX is detected and length is greater than 7, the MMX routine is called
for the buffer and this function only processes the trailing (length % 8)
bytes in C. Otherwise the whole buffer is processed in C.

\param Src Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Cmin Normalization constant.
\param Cmax Normalization constant.
\param Nmin Normalization constant.
\param Nmax Normalization constant.

\return Returns 0 for success or -1 for error (Src or Dest is NULL).
*/
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
								   int Nmax)
{
	unsigned int i, istart;
	int dC, factor;
	int result;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX handles the buffer in groups of 8 bytes; its status is not used here */
		(void) SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);

		/* No unaligned bytes - we are done */
		if ((length & 7) == 0)
			return (0);

		/* Continue in C with the bytes after the last full group of 8 */
		istart = length & 0xfffffff8;
	} else {
		/* Process the whole image in C */
		istart = 0;
	}

	/* Integer scale factor; a zero divisor falls back to 255 like the MMX routine */
	dC = Cmax - Cmin;
	if (dC == 0) {
		factor = 255;
	} else {
		factor = (Nmax - Nmin) / dC;
	}

	for (i = istart; i < length; i++) {
		result = factor * ((int) Src[i] - Cmin) + Nmin;
		/* Saturate to the byte range instead of letting the cast wrap around */
		if (result > 255) {
			result = 255;
		} else if (result < 0) {
			result = 0;
		}
		Dest[i] = (unsigned char) result;
	}

	return (0);
}
4158
4159/* ------------------------------------------------------------------------------------ */
4160
4161/*!
4162\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... ) 
4163
Convolves the interior of a rows x columns byte image with a 3x3 kernel, divides
each 16-bit sum by Divisor and stores the result saturated to 0..255. All work is
done in inline MMX assembly. Only Dest pixels in rows 1..rows-2 and columns
1..columns-2 are written; the border pixels of Dest are not touched.

4164\param Src The source 2D byte array to convolve. Should be different from destination.
4165\param Dest The destination 2D byte array to store the result in. Should be different from source.
4166\param rows Number of rows in source/destination array. Must be >2.
4167\param columns Number of columns in source/destination array. Must be >2.
4168\param Kernel The 2D convolution kernel of size 3x3. The assembly loads three 8-byte groups
from this pointer (offsets 0, 8 and 16), i.e. each kernel row occupies 4 signed shorts; the
register comments below show the 4th word of every row as 0.
4169\param Divisor The divisor of the convolution sum. Must be >0.
4170
4171Note: Non-MMX implementation not available for this function.

NOTE(review): every source access is an 8-byte movq, so the loads for the last pixels of
the last rows appear to extend a few bytes past rows*columns - confirm that callers
provide a padded Src buffer.
4172
4173\return Returns 0 if MMX was detected (the filter branch was taken), -1 if a parameter is
invalid or MMX is not available.
4174*/
4175int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4176										   signed short *Kernel, unsigned char Divisor)
4177{
4178	/* Validate input parameters */
4179	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4180		return(-1);
4181
	/* The kernel needs at least a 3x3 image; a zero Divisor is rejected before the idiv below */
4182	if ((columns < 3) || (rows < 3) || (Divisor == 0))
4183		return (-1);
4184
4185	if ((SDL_imageFilterMMXdetect())) {
4186#ifdef USE_MMX
4187#if !defined(GCC__)
		/* Non-GCC (MSVC-style) inline assembly variant */
4188		__asm
4189		{
4190			pusha
4191				pxor mm0, mm0   	/* zero MM0 */
4192				xor ebx, ebx   	/* zero EBX */
4193				mov bl, Divisor   	/* load Divisor into BL */
4194				mov edx, Kernel   	/* load Kernel address into EDX */
4195				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
4196			add edx, 8   	/* second row              |K0 K1 K2 0| */
4197				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
4198			add edx, 8   	/* third row               |K6 K7 K8 0| */
4199				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
4200			/* ---, */
4201			mov eax, columns   	/* load columns into EAX */
4202				mov esi, Src   	/* ESI = Src row 0 address */
4203				mov edi, Dest   	/* load Dest address to EDI */
4204				add edi, eax   	/* EDI = EDI + columns */
4205				inc              edi    	/* 1 byte offset from the left edge */
4206				mov edx, rows   	/* initialize ROWS counter */
4207				sub edx, 2   	/* do not use first and last row */
4208				/* ---, */
4209L10320:
4210			mov ecx, eax   	/* initialize COLUMNS counter */
4211				sub ecx, 2   	/* do not use first and last column */
4212				align 16                 	/* 16 byte alignment of the loop entry */
4213L10322:
4214			/* ---, */
4215			movq mm1, [esi]   	/* load 8 bytes of the image first row */
4216			add esi, eax   	/* move one row below */
4217				movq mm2, [esi]   	/* load 8 bytes of the image second row */
4218			add esi, eax   	/* move one row below */
4219				movq mm3, [esi]   	/* load 8 bytes of the image third row */
4220			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
4221				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
4222				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
4223				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
4224				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
4225				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
4226				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
4227				paddsw mm1, mm3   	/* add 4 words of the third row and result */
4228				movq mm2, mm1   	/* copy MM1 into MM2 */
4229				psrlq mm1, 32   	/* shift 2 left words to the right */
4230				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
4231				movq mm3, mm1   	/* copy MM1 into MM3 */
4232				psrlq mm1, 16   	/* shift 1 left word to the right */
4233				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
4234				/* --, */
4235				movd mm2, eax   	/* save EAX in MM2 */
4236				movd mm3, edx   	/* save EDX in MM3 */
4237				movd eax, mm1   	/* copy MM1 into EAX */
4238				psraw mm1, 15   	/* spread sign bit of the result */
4239				movd edx, mm1   	/* fill EDX with a sign bit */
4240				idiv bx    	/* IDIV - VERY EXPENSIVE (16-bit: DX:AX / BX, quotient in AX) */
4241				movd mm1, eax   	/* move result of division into MM1 */
4242				packuswb mm1, mm0   	/* pack division result with saturation */
4243				movd eax, mm1   	/* copy saturated result into EAX */
4244				mov [edi], al   	/* copy a byte result into Dest */
4245				movd edx, mm3   	/* restore saved EDX */
4246				movd eax, mm2   	/* restore saved EAX */
4247				/* --, */
4248				sub esi, eax   	/* move two rows up */
4249				sub esi, eax   	/* */
4250				inc              esi    	/* move Src  pointer to the next pixel */
4251				inc              edi    	/* move Dest pointer to the next pixel */
4252				/* ---, */
4253				dec              ecx    	/* decrease loop counter COLUMNS */
4254				jnz            L10322    	/* check loop termination, proceed if required */
4255				add esi, 2   	/* move to the next row in Src */
4256				add edi, 2   	/* move to the next row in Dest */
4257				dec              edx    	/* decrease loop counter ROWS */
4258				jnz            L10320    	/* check loop termination, proceed if required */
4259				/* ---, */
4260				emms                      	/* exit MMX state */
4261				popa
4262		}
4263#else
		/* GCC extended-asm variant of the same algorithm (AT&T operand order: source, destination) */
4264		asm volatile
4265			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
4266			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
4267			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
4268			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
4269			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
4270			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
4271			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
4272			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
4273			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
4274			/* --- */
4275			"mov          %3, %%eax \n\t"	/* load columns into EAX */
4276			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
4277			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
4278			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
4279			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
4280			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
4281			"sub          $2, %%edx \n\t"	/* do not use first and last row */
4282			/* --- */
4283			".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
4284			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
4285			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
4286			".L10322:               \n\t"
4287			/* --- */
4288			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
4289			"add       %%eax, %%esi \n\t"	/* move one row below */
4290			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
4291			"add       %%eax, %%esi \n\t"	/* move one row below */
4292			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
4293			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
4294			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
4295			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
4296			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
4297			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
4298			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
4299			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
4300			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
4301			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
4302			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
4303			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
4304			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
4305			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
4306			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
4307			/* -- */
4308			"movd      %%eax, %%mm2 \n\t"	/* save EAX in MM2 */
4309			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
4310			"movd      %%mm1, %%eax \n\t"	/* copy MM1 into EAX */
4311			"psraw       $15, %%mm1 \n\t"	/* spread sign bit of the result */
4312			"movd      %%mm1, %%edx \n\t"	/* fill EDX with a sign bit */
4313			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE (16-bit: DX:AX / BX, quotient in AX) */
4314			"movd      %%eax, %%mm1 \n\t"	/* move result of division into MM1 */
4315			"packuswb  %%mm0, %%mm1 \n\t"	/* pack division result with saturation */
4316			"movd      %%mm1, %%eax \n\t"	/* copy saturated result into EAX */
4317			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
4318			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
4319			"movd      %%mm2, %%eax \n\t"	/* restore saved EAX */
4320			/* -- */
4321			"sub       %%eax, %%esi \n\t"	/* move two rows up */
4322			"sub       %%eax, %%esi \n\t"	/* */
4323			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
4324			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
4325			/* --- */
4326			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
4327			"jnz            .L10322 \n\t"	/* check loop termination, proceed if required */
4328			"add          $2, %%esi \n\t"	/* move to the next row in Src */
4329			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
4330			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
4331			"jnz            .L10320 \n\t"	/* check loop termination, proceed if required */
4332			/* --- */
4333			"emms                   \n\t"	/* exit MMX state */
4334			"popa                   \n\t":"=m" (Dest)	/* %0 */
4335			:"m"(Src),		/* %1 */
4336			"m"(rows),		/* %2 */
4337			"m"(columns),		/* %3 */
4338			"m"(Kernel),		/* %4 */
4339			"m"(Divisor)		/* %5 */
4340			);
4341#endif
4342#endif
		/* Reached whenever MMX was detected; if USE_MMX is not defined no assembly is compiled in and nothing is written */
4343		return (0);
4344	} else {
4345		/* No non-MMX implementation yet */
4346		return (-1);
4347	}
4348}
4349
4350/*!
4351\brief Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... ) 
4352
Convolves the interior of a rows x columns byte image with a 5x5 kernel, divides
each 16-bit sum by Divisor and stores the result saturated to 0..255. All work is
done in inline MMX assembly. Only Dest pixels in rows 2..rows-3 and columns
2..columns-3 are written; the border pixels of Dest are not touched.

4353\param Src The source 2D byte array to convolve. Should be different from destination.
4354\param Dest The destination 2D byte array to store the result in. Should be different from source.
4355\param rows Number of rows in source/destination array. Must be >4.
4356\param columns Number of columns in source/destination array. Must be >4.
4357\param Kernel The 2D convolution kernel of size 5x5. The assembly loads two 8-byte groups per
kernel row (10 loads for 5 rows), i.e. each kernel row occupies 8 signed shorts. Words 5..7 of a
row are multiplied with the 6th..8th source bytes - presumably they must be 0; TODO confirm.
4358\param Divisor The divisor of the convolution sum. Must be >0.
4359
4360Note: Non-MMX implementation not available for this function.

NOTE(review): every source access is an 8-byte movq, so the loads for the last pixels of
the last rows appear to extend a few bytes past rows*columns - confirm that callers
provide a padded Src buffer.
4361
4362\return Returns 0 if MMX was detected (the filter branch was taken), -1 if a parameter is
invalid or MMX is not available.
4363*/
4364int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4365										   signed short *Kernel, unsigned char Divisor)
4366{
4367	/* Validate input parameters */
4368	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4369		return(-1);
4370
	/* The kernel needs at least a 5x5 image; a zero Divisor is rejected before the idiv below */
4371	if ((columns < 5) || (rows < 5) || (Divisor == 0))
4372		return (-1);
4373
4374	if ((SDL_imageFilterMMXdetect())) {
4375#ifdef USE_MMX
4376#if !defined(GCC__)
		/* Non-GCC (MSVC-style) inline assembly variant */
4377		__asm
4378		{
4379			pusha
4380				pxor mm0, mm0   	/* zero MM0 */
4381				xor ebx, ebx   	/* zero EBX */
4382				mov bl, Divisor   	/* load Divisor into BL */
4383				movd mm5, ebx   	/* copy Divisor into MM5 */
4384				mov edx, Kernel   	/* load Kernel address into EDX */
4385				mov esi, Src   	/* load Src  address to ESI */
4386				mov edi, Dest   	/* load Dest address to EDI */
4387				add edi, 2   	/* 2 column offset from the left edge */
4388				mov eax, columns   	/* load columns into EAX */
4389				shl eax, 1   	/* EAX = columns * 2 */
4390				add edi, eax   	/* 2 row offset from the top edge */
4391				shr eax, 1   	/* EAX = columns */
4392				mov ebx, rows   	/* initialize ROWS counter */
4393				sub ebx, 4   	/* do not use first 2 and last 2 rows */
4394				/* ---, */
4395L10330:
4396			mov ecx, eax   	/* initialize COLUMNS counter */
4397				sub ecx, 4   	/* do not use first 2 and last 2 columns */
4398				align 16                 	/* 16 byte alignment of the loop entry */
4399L10332:
4400			pxor mm7, mm7   	/* zero MM7 (accumulator) */
4401				movd mm6, esi   	/* save ESI in MM6 */
4402				/* --- 1 */
4403				movq mm1, [esi]   	/* load 8 bytes of the Src */
4404			movq mm2, mm1   	/* copy MM1 into MM2 */
4405				add esi, eax   	/* move Src pointer 1 row below */
4406				movq mm3, [edx]   	/* load 4 words of Kernel */
4407			add edx, 8   	/* move pointer to other 4 words */
4408				movq mm4, [edx]   	/* load 4 words of Kernel */
4409			add edx, 8   	/* move pointer to other 4 words */
4410				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
4411				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
4412				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
4413				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
4414				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
4415				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
4416				/* --- 2 */
4417				movq mm1, [esi]   	/* load 8 bytes of the Src */
4418			movq mm2, mm1   	/* copy MM1 into MM2 */
4419				add esi, eax   	/* move Src pointer 1 row below */
4420				movq mm3, [edx]   	/* load 4 words of Kernel */
4421			add edx, 8   	/* move pointer to other 4 words */
4422				movq mm4, [edx]   	/* load 4 words of Kernel */
4423			add edx, 8   	/* move pointer to other 4 words */
4424				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
4425				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
4426				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
4427				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
4428				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
4429				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
4430				/* --- 3 */
4431				movq mm1, [esi]   	/* load 8 bytes of the Src */
4432			movq mm2, mm1   	/* copy MM1 into MM2 */
4433				add esi, eax   	/* move Src pointer 1 row below */
4434				movq mm3, [edx]   	/* load 4 words of Kernel */
4435			add edx, 8   	/* move pointer to other 4 words */
4436				movq mm4, [edx]   	/* load 4 words of Kernel */
4437			add edx, 8   	/* move pointer to other 4 words */
4438				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
4439				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
4440				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
4441				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
4442				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
4443				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
4444				/* --- 4 */
4445				movq mm1, [esi]   	/* load 8 bytes of the Src */
4446			movq mm2, mm1   	/* copy MM1 into MM2 */
4447				add esi, eax   	/* move Src pointer 1 row below */
4448				movq mm3, [edx]   	/* load 4 words of Kernel */
4449			add edx, 8   	/* move pointer to other 4 words */
4450				movq mm4, [edx]   	/* load 4 words of Kernel */
4451			add edx, 8   	/* move pointer to other 4 words */
4452				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
4453				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
4454				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
4455				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
4456				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
4457				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
4458				/* --- 5 */
4459				movq mm1, [esi]   	/* load 8 bytes of the Src */
4460			movq mm2, mm1   	/* copy MM1 into MM2 */
4461				movq mm3, [edx]   	/* load 4 words of Kernel */
4462			add edx, 8   	/* move pointer to other 4 words */
4463				movq mm4, [edx]   	/* load 4 words of Kernel */
4464			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
4465				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
4466				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
4467				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
4468				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
4469				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
4470				/* ---, */
4471				movq mm3, mm7   	/* copy MM7 into MM3 */
4472				psrlq mm7, 32   	/* shift 2 left words to the right */
4473				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
4474				movq mm2, mm7   	/* copy MM7 into MM2 */
4475				psrlq mm7, 16   	/* shift 1 left word to the right */
4476				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
4477				/* ---, */
4478				movd mm1, eax   	/* save EAX in MM1 */
4479				movd mm2, ebx   	/* save EBX in MM2 */
4480				movd mm3, edx   	/* save EDX in MM3 */
4481				movd eax, mm7   	/* load summation result into EAX */
4482				psraw mm7, 15   	/* spread sign bit of the result */
4483				movd ebx, mm5   	/* load Divisor into EBX */
4484				movd edx, mm7   	/* fill EDX with a sign bit */
4485				idiv bx    	/* IDIV - VERY EXPENSIVE (16-bit: DX:AX / BX, quotient in AX) */
4486				movd mm7, eax   	/* move result of division into MM7 */
4487				packuswb mm7, mm0   	/* pack division result with saturation */
4488				movd eax, mm7   	/* copy saturated result into EAX */
4489				mov [edi], al   	/* copy a byte result into Dest */
4490				movd edx, mm3   	/* restore saved EDX */
4491				movd ebx, mm2   	/* restore saved EBX */
4492				movd eax, mm1   	/* restore saved EAX */
4493				/* --, */
4494				movd esi, mm6   	/* move Src pointer to the top pixel */
4495				sub edx, 72   	/* EDX = Kernel address (undo the 9 increments of 8 bytes above) */
4496				inc              esi    	/* move Src  pointer to the next pixel */
4497				inc              edi    	/* move Dest pointer to the next pixel */
4498				/* ---, */
4499				dec              ecx    	/* decrease loop counter COLUMNS */
4500				jnz            L10332    	/* check loop termination, proceed if required */
4501				add esi, 4   	/* move to the next row in Src */
4502				add edi, 4   	/* move to the next row in Dest */
4503				dec              ebx    	/* decrease loop counter ROWS */
4504				jnz            L10330    	/* check loop termination, proceed if required */
4505				/* ---, */
4506				emms                      	/* exit MMX state */
4507				popa
4508		}
4509#else
		/* GCC extended-asm variant of the same algorithm (AT&T operand order: source, destination) */
4510		asm volatile
4511			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
4512			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
4513			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
4514			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
4515			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
4516			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
4517			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
4518			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
4519			"mov          %3, %%eax \n\t"	/* load columns into EAX */
4520			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
4521			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
4522			"shr          $1, %%eax \n\t"	/* EAX = columns */
4523			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
4524			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
4525			/* --- */
4526			".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
4527			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
4528			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
4529			".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
4530			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
4531			/* --- 1 */
4532			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
4533			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
4534			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
4535			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
4536			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4537			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
4538			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4539			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
4540			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
4541			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
4542			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
4543			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
4544			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
4545			/* --- 2 */
4546			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
4547			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
4548			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
4549			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
4550			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4551			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
4552			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4553			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
4554			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
4555			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
4556			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
4557			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
4558			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
4559			/* --- 3 */
4560			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
4561			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
4562			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
4563			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
4564			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4565			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
4566			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4567			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
4568			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
4569			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
4570			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
4571			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
4572			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
4573			/* --- 4 */
4574			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
4575			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
4576			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
4577			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
4578			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4579			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
4580			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4581			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
4582			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
4583			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
4584			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
4585			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
4586			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
4587			/* --- 5 */
4588			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
4589			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
4590			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
4591			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
4592			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
4593			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
4594			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
4595			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
4596			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
4597			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
4598			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
4599			/* --- */
4600			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
4601			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
4602			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
4603			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
4604			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
4605			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
4606			/* --- */
4607			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
4608			"movd      %%ebx, %%mm2 \n\t"	/* save EBX in MM2 */
4609			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
4610			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
4611			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
4612			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
4613			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
4614			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE (16-bit: DX:AX / BX, quotient in AX) */
4615			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
4616			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
4617			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
4618			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
4619			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
4620			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
4621			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
4622			/* -- */
4623			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
4624			"sub         $72, %%edx \n\t"	/* EDX = Kernel address (undo the 9 increments of 8 bytes above) */
4625			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
4626			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
4627			/* --- */
4628			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
4629			"jnz            .L10332 \n\t"	/* check loop termination, proceed if required */
4630			"add          $4, %%esi \n\t"	/* move to the next row in Src */
4631			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
4632			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
4633			"jnz            .L10330 \n\t"	/* check loop termination, proceed if required */
4634			/* --- */
4635			"emms                   \n\t"	/* exit MMX state */
4636			"popa                   \n\t":"=m" (Dest)	/* %0 */
4637			:"m"(Src),		/* %1 */
4638			"m"(rows),		/* %2 */
4639			"m"(columns),		/* %3 */
4640			"m"(Kernel),		/* %4 */
4641			"m"(Divisor)		/* %5 */
4642			);
4643#endif
4644#endif
		/* Reached whenever MMX was detected; if USE_MMX is not defined no assembly is compiled in and nothing is written */
4645		return (0);
4646	} else {
4647		/* No non-MMX implementation yet */
4648		return (-1);
4649	}
4650}
4651
/*!
\brief Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... ) 

For every pixel that has a complete 7x7 neighbourhood, the neighbourhood is
multiplied element-wise with Kernel, summed, divided by Divisor and stored in
Dest saturated to 0..255. The outermost 3 rows and 3 columns of Dest are not
written by this routine.

Implementation notes (derived from the MMX code below):
- Products and sums are kept in signed 16-bit lanes (pmullw keeps the low
  16 bits of each product, paddsw adds with signed saturation), so large
  coefficients can wrap or saturate before the division takes place.
- Each kernel row is fetched as 8 words (two 8-byte loads, row stride of
  16 bytes), i.e. 7 * 8 = 56 words are read from Kernel. The 8th word of a
  row is multiplied with the source byte one column to the right of the 7x7
  window, so it has to be 0 for a pure 7x7 result.
- Each window row is fetched as 8 source bytes. For the right-most output
  pixel of a row this touches one byte beyond that source row; for the very
  last output pixel that byte lies directly behind the rows*columns bytes
  of Src.
- The inline assembly uses pusha/popa and 32-bit registers for addresses,
  i.e. it is 32-bit x86 code.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >6.
\param columns Number of columns in source/destination array. Must be >6.
\param Kernel The 2D convolution kernel of size 7x7 (see the layout note above).
\param Divisor The divisor of the convolution sum. Must be >0.

Note: Non-MMX implementation not available for this function.

\return Returns 0 for success or -1 for error (invalid parameter, MMX not
available at run time, or MMX code not compiled in).
*/
int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
										   signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 7) || (rows < 7) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#ifdef USE_MMX
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			xor ebx, ebx   	/* zero EBX */
			mov bl, Divisor   	/* load Divisor into BL */
			movd mm5, ebx   	/* copy Divisor into MM5 */
			mov edx, Kernel  	/* load Kernel address into EDX */
			mov esi, Src   	/* load Src  address to ESI */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, 3   	/* 3 column offset from the left edge */
			mov eax, columns   	/* load columns into EAX */
			add edi, eax   	/* 3 row offset from the top edge */
			add edi, eax
			add edi, eax
			mov ebx, rows   	/* initialize ROWS counter */
			sub ebx, 6   	/* do not use first 3 and last 3 rows */
			/* --- */
L10340:
			mov ecx, eax   	/* initialize COLUMNS counter */
			sub ecx, 6   	/* do not use first 3 and last 3 columns */
			align 16                 	/* 16 byte alignment of the loop entry */
L10342:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
			movd mm6, esi   	/* save ESI in MM6 */
			/* --- window row 1 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- window row 2 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- window row 3 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- window row 4 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- window row 5 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- window row 6 */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			add esi, eax   	/* move Src pointer 1 row below */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- window row 7 (ESI is not advanced; EDX advances by 8 only) */
			movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
			movq mm4, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
			punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
			pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
			pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
			paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
			paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
			/* --- horizontal sum of the 4 accumulator words into word 0 */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* shift 2 left words to the right */
			paddsw mm7, mm3   	/* add 2 left and 2 right result words */
			movq mm2, mm7   	/* copy MM7 into MM2 */
			psrlq mm7, 16   	/* shift 1 left word to the right */
			paddsw mm7, mm2   	/* add 1 left and 1 right result words */
			/* --- signed 16-bit division DX:AX / BX, registers parked in MMX */
			movd mm1, eax   	/* save EAX in MM1 */
			movd mm2, ebx   	/* save EBX in MM2 */
			movd mm3, edx   	/* save EDX in MM3 */
			movd eax, mm7   	/* load summation result into EAX */
			psraw mm7, 15   	/* spread sign bit of the result */
			movd ebx, mm5   	/* load Divisor into EBX */
			movd edx, mm7   	/* fill EDX with a sign bit */
			idiv bx    	/* IDIV - VERY EXPENSIVE */
			movd mm7, eax   	/* move result of division into MM7 */
			packuswb mm7, mm0   	/* pack division result with saturation */
			movd eax, mm7   	/* copy saturated result into EAX */
			mov [edi], al   	/* copy a byte result into Dest */
			movd edx, mm3   	/* restore saved EDX */
			movd ebx, mm2   	/* restore saved EBX */
			movd eax, mm1   	/* restore saved EAX */
			/* --- */
			movd esi, mm6   	/* move Src pointer to the top pixel */
			sub edx, 104   	/* EDX = Kernel address (6 rows * 16 bytes + 8) */
			inc              esi    	/* move Src  pointer to the next pixel */
			inc              edi    	/* move Dest pointer to the next pixel */
			/* --- */
			dec              ecx    	/* decrease loop counter COLUMNS */
			jnz            L10342    	/* check loop termination, proceed if required */
			add esi, 6   	/* move to the next row in Src */
			add edi, 6   	/* move to the next row in Dest */
			dec              ebx    	/* decrease loop counter ROWS */
			jnz            L10340    	/* check loop termination, proceed if required */
			/* --- */
			emms                      	/* exit MMX state */
			popa
		}
#else
		/* __asm__ / __volatile__ are used so the block is also accepted in strict -std= modes. */
		__asm__ __volatile__
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
			/* --- */
			".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- window row 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- window row 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- window row 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- window row 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- window row 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- window row 6 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- window row 7 (ESI is not advanced; EDX advances by 8 only) */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- horizontal sum of the 4 accumulator words into word 0 */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			/* --- signed 16-bit division DX:AX / BX, registers parked in MMX */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"movd      %%ebx, %%mm2 \n\t"	/* save EBX in MM2 */
			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* --- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub        $104, %%edx \n\t"	/* EDX = Kernel address (6 rows * 16 bytes + 8) */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10342 \n\t"	/* check loop termination, proceed if required */
			"add          $6, %%esi \n\t"	/* move to the next row in Src */
			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10340 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(Divisor)		/* %5 */
			:"memory"		/* Src/Kernel are read and Dest is written through raw pointers */
			);
#endif
		return (0);
#else
		/* MMX was reported at run time, but the MMX code was compiled out:
		   nothing has been written to Dest, so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}
5007
5008/*!
5009\brief Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... ) 
5010
5011\param Src The source 2D byte array to convolve. Should be different from destination.
5012\param Dest The destination 2D byte array to store the result in. Should be different from source.
5013\param rows Number of rows in source/destination array. Must be >8.
5014\param columns Number of columns in source/destination array. Must be >8.
5015\param Kernel The 2D convolution kernel of size 9x9.
5016\param Divisor The divisor of the convolution sum. Must be >0.
5017
5018Note: Non-MMX implementation not available for this function.
5019
\return Returns 0 for success or -1 for error.
5021*/
5022int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5023										   signed short *Kernel, unsigned char Divisor)
5024{
5025	/* Validate input parameters */
5026	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5027		return(-1);
5028
5029	if ((columns < 9) || (rows < 9) || (Divisor == 0))
5030		return (-1);
5031
5032	if ((SDL_imageFilterMMXdetect())) {
5033#ifdef USE_MMX
5034#if !defined(GCC__)
5035		__asm
5036		{
5037			pusha
5038				pxor mm0, mm0   	/* zero MM0 */
5039				xor ebx, ebx   	/* zero EBX */
5040				mov bl, Divisor   	/* load Divisor into BL */
5041				movd mm5, ebx   	/* copy Divisor into MM5 */
5042				mov edx, Kernel   	/* load Kernel address into EDX */
5043				mov esi, Src   	/* load Src  address to ESI */
5044				mov edi, Dest   	/* load Dest address to EDI */
5045				add edi, 4   	/* 4 column offset from the left edge */
5046				mov eax, columns   	/* load columns into EAX */
5047				add edi, eax   	/* 4 row offset from the top edge */
5048				add edi, eax
5049				add edi, eax
5050				add edi, eax
5051				mov ebx, rows   	/* initialize ROWS counter */
5052				sub ebx, 8   	/* do not use first 4 and last 4 rows */
5053				/* ---, */
5054L10350:
5055			mov ecx, eax   	/* initialize COLUMNS counter */
5056				sub ecx, 8   	/* do not use first 4 and last 4 columns */
5057				align 16                 	/* 16 byte alignment of the loop entry */
5058L10352:
5059			pxor mm7, mm7   	/* zero MM7 (accumulator) */
5060				movd mm6, esi   	/* save ESI in MM6 */
5061				/* --- 1 */
5062				movq mm1, [esi]   	/* load 8 bytes of the Src */
5063			movq mm2, mm1   	/* copy MM1 into MM2 */
5064				inc              esi    	/* move pointer to the next 8 bytes of Src */
5065				movq mm3, [edx]   	/* load 4 words of Kernel */
5066			add edx, 8   	/* move pointer to other 4 words */
5067				movq mm4, [edx]   	/* load 4 words of Kernel */
5068			add edx, 8   	/* move pointer to other 4 words */
5069				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5070				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5071				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5072				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5073				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5074				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5075				movq mm1, [esi]   	/* load 8 bytes of the Src */
5076			dec              esi
5077				add esi, eax   	/* move Src pointer 1 row below */
5078				movq mm3, [edx]   	/* load 4 words of Kernel */
5079			add edx, 8   	/* move pointer to other 4 words */
5080				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5081				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5082				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5083				/* --- 2 */
5084				movq mm1, [esi]   	/* load 8 bytes of the Src */
5085			movq mm2, mm1   	/* copy MM1 into MM2 */
5086				inc              esi    	/* move pointer to the next 8 bytes of Src */
5087				movq mm3, [edx]   	/* load 4 words of Kernel */
5088			add edx, 8   	/* move pointer to other 4 words */
5089				movq mm4, [edx]   	/* load 4 words of Kernel */
5090			add edx, 8   	/* move pointer to other 4 words */
5091				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5092				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5093				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5094				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5095				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5096				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5097				movq mm1, [esi]   	/* load 8 bytes of the Src */
5098			dec              esi
5099				add esi, eax   	/* move Src pointer 1 row below */
5100				movq mm3, [edx]   	/* load 4 words of Kernel */
5101			add edx, 8   	/* move pointer to other 4 words */
5102				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5103				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5104				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5105				/* --- 3 */
5106				movq mm1, [esi]   	/* load 8 bytes of the Src */
5107			movq mm2, mm1   	/* copy MM1 into MM2 */
5108				inc              esi    	/* move pointer to the next 8 bytes of Src */
5109				movq mm3, [edx]   	/* load 4 words of Kernel */
5110			add edx, 8   	/* move pointer to other 4 words */
5111				movq mm4, [edx]   	/* load 4 words of Kernel */
5112			add edx, 8   	/* move pointer to other 4 words */
5113				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5114				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5115				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5116				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5117				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5118				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5119				movq mm1, [esi]   	/* load 8 bytes of the Src */
5120			dec              esi
5121				add esi, eax   	/* move Src pointer 1 row below */
5122				movq mm3, [edx]   	/* load 4 words of Kernel */
5123			add edx, 8   	/* move pointer to other 4 words */
5124				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5125				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5126				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5127				/* --- 4 */
5128				movq mm1, [esi]   	/* load 8 bytes of the Src */
5129			movq mm2, mm1   	/* copy MM1 into MM2 */
5130				inc              esi    	/* move pointer to the next 8 bytes of Src */
5131				movq mm3, [edx]   	/* load 4 words of Kernel */
5132			add edx, 8   	/* move pointer to other 4 words */
5133				movq mm4, [edx]   	/* load 4 words of Kernel */
5134			add edx, 8   	/* move pointer to other 4 words */
5135				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5136				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5137				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5138				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5139				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5140				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5141				movq mm1, [esi]   	/* load 8 bytes of the Src */
5142			dec              esi
5143				add esi, eax   	/* move Src pointer 1 row below */
5144				movq mm3, [edx]   	/* load 4 words of Kernel */
5145			add edx, 8   	/* move pointer to other 4 words */
5146				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5147				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5148				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5149				/* --- 5 */
5150				movq mm1, [esi]   	/* load 8 bytes of the Src */
5151			movq mm2, mm1   	/* copy MM1 into MM2 */
5152				inc              esi    	/* move pointer to the next 8 bytes of Src */
5153				movq mm3, [edx]   	/* load 4 words of Kernel */
5154			add edx, 8   	/* move pointer to other 4 words */
5155				movq mm4, [edx]   	/* load 4 words of Kernel */
5156			add edx, 8   	/* move pointer to other 4 words */
5157				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5158				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5159				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5160				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5161				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5162				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5163				movq mm1, [esi]   	/* load 8 bytes of the Src */
5164			dec              esi
5165				add esi, eax   	/* move Src pointer 1 row below */
5166				movq mm3, [edx]   	/* load 4 words of Kernel */
5167			add edx, 8   	/* move pointer to other 4 words */
5168				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5169				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5170				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5171				/* --- 6 */
5172				movq mm1, [esi]   	/* load 8 bytes of the Src */
5173			movq mm2, mm1   	/* copy MM1 into MM2 */
5174				inc              esi    	/* move pointer to the next 8 bytes of Src */
5175				movq mm3, [edx]   	/* load 4 words of Kernel */
5176			add edx, 8   	/* move pointer to other 4 words */
5177				movq mm4, [edx]   	/* load 4 words of Kernel */
5178			add edx, 8   	/* move pointer to other 4 words */
5179				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5180				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5181				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5182				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5183				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5184				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5185				movq mm1, [esi]   	/* load 8 bytes of the Src */
5186			dec              esi
5187				add esi, eax   	/* move Src pointer 1 row below */
5188				movq mm3, [edx]   	/* load 4 words of Kernel */
5189			add edx, 8   	/* move pointer to other 4 words */
5190				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5191				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5192				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5193				/* --- 7 */
5194				movq mm1, [esi]   	/* load 8 bytes of the Src */
5195			movq mm2, mm1   	/* copy MM1 into MM2 */
5196				inc              esi    	/* move pointer to the next 8 bytes of Src */
5197				movq mm3, [edx]   	/* load 4 words of Kernel */
5198			add edx, 8   	/* move pointer to other 4 words */
5199				movq mm4, [edx]   	/* load 4 words of Kernel */
5200			add edx, 8   	/* move pointer to other 4 words */
5201				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5202				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5203				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5204				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5205				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5206				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5207				movq mm1, [esi]   	/* load 8 bytes of the Src */
5208			dec              esi
5209				add esi, eax   	/* move Src pointer 1 row below */
5210				movq mm3, [edx]   	/* load 4 words of Kernel */
5211			add edx, 8   	/* move pointer to other 4 words */
5212				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5213				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5214				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5215				/* --- 8 */
5216				movq mm1, [esi]   	/* load 8 bytes of the Src */
5217			movq mm2, mm1   	/* copy MM1 into MM2 */
5218				inc              esi    	/* move pointer to the next 8 bytes of Src */
5219				movq mm3, [edx]   	/* load 4 words of Kernel */
5220			add edx, 8   	/* move pointer to other 4 words */
5221				movq mm4, [edx]   	/* load 4 words of Kernel */
5222			add edx, 8   	/* move pointer to other 4 words */
5223				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5224				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5225				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5226				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5227				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5228				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5229				movq mm1, [esi]   	/* load 8 bytes of the Src */
5230			dec              esi
5231				add esi, eax   	/* move Src pointer 1 row below */
5232				movq mm3, [edx]   	/* load 4 words of Kernel */
5233			add edx, 8   	/* move pointer to other 4 words */
5234				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5235				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5236				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5237				/* --- 9 */
5238				movq mm1, [esi]   	/* load 8 bytes of the Src */
5239			movq mm2, mm1   	/* copy MM1 into MM2 */
5240				inc              esi    	/* move pointer to the next 8 bytes of Src */
5241				movq mm3, [edx]   	/* load 4 words of Kernel */
5242			add edx, 8   	/* move pointer to other 4 words */
5243				movq mm4, [edx]   	/* load 4 words of Kernel */
5244			add edx, 8   	/* move pointer to other 4 words */
5245				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5246				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
5247				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5248				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
5249				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
5250				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5251				movq mm1, [esi]   	/* load 8 bytes of the Src */
5252			movq mm3, [edx]   	/* load 4 words of Kernel */
5253			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
5254				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
5255				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
5256				/* ---, */
5257				movq mm3, mm7   	/* copy MM7 into MM3 */
5258				psrlq mm7, 32   	/* shift 2 left words to the right */
5259				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
5260				movq mm2, mm7   	/* copy MM7 into MM2 */
5261				psrlq mm7, 16   	/* shift 1 left word to the right */
5262				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
5263				/* ---, */
5264				movd mm1, eax   	/* save EDX in MM1 */
5265				movd mm2, ebx   	/* save EDX in MM2 */
5266				movd mm3, edx   	/* save EDX in MM3 */
5267				movd eax, mm7   	/* load summation result into EAX */
5268				psraw mm7, 15   	/* spread sign bit of the result */
5269				movd ebx, mm5   	/* load Divisor into EBX */
5270				movd edx, mm7   	/* fill EDX with a sign bit */
5271				idiv bx    	/* IDIV - VERY EXPENSIVE */
5272				movd mm7, eax   	/* move result of division into MM7 */
5273				packuswb mm7, mm0   	/* pack division result with saturation */
5274				movd eax, mm7   	/* copy saturated result into EAX */
5275				mov [edi], al   	/* copy a byte result into Dest */
5276				movd edx, mm3   	/* restore saved EDX */
5277				movd ebx, mm2   	/* restore saved EBX */
5278				movd eax, mm1   	/* restore saved EAX */
5279				/* --, */
5280				movd esi, mm6   	/* move Src pointer to the top pixel */
5281				sub edx, 208   	/* EDX = Kernel address */
5282				inc              esi    	/* move Src  pointer to the next pixel */
5283				inc              edi    	/* move Dest pointer to the next pixel */
5284				/* ---, */
5285				dec              ecx    	/* decrease loop counter COLUMNS */
5286				jnz            L10352    	/* check loop termination, proceed if required */
5287				add esi, 8   	/* move to the next row in Src */
5288				add edi, 8   	/* move to the next row in Dest */
5289				dec              ebx    	/* decrease loop counter ROWS */
5290				jnz            L10350    	/* check loop termination, proceed if required */
5291				/* ---, */
5292				emms                      	/* exit MMX state */
5293				popa
5294		}
5295#else
5296		asm volatile
5297			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
5298			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
5299			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
5300			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
5301			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
5302			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
5303			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
5304			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
5305			"mov          %3, %%eax \n\t"	/* load columns into EAX */
5306			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
5307			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
5308			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
5309			/* --- */
5310			".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
5311			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
5312			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
5313			".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
5314			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
5315			/* --- 1 */
5316			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5317			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5318			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5319			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5320			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5321			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5322			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5323			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5324			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5325			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5326			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5327			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5328			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5329			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5330			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5331			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5332			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5333			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5334			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5335			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5336			/* --- 2 */
5337			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5338			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5339			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5340			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5341			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5342			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5343			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5344			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5345			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5346			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5347			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5348			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5349			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5350			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5351			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5352			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5353			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5354			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5355			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5356			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5357			/* --- 3 */
5358			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5359			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5360			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5361			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5362			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5363			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5364			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5365			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5366			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5367			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5368			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5369			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5370			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5371			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5372			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5373			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5374			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5375			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5376			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5377			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5378			/* --- 4 */
5379			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5380			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5381			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5382			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5383			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5384			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5385			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5386			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5387			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5388			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5389			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5390			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5391			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5392			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5393			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5394			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5395			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5396			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5397			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5398			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5399			/* --- 5 */
5400			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5401			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5402			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5403			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5404			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5405			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5406			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5407			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5408			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5409			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5410			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5411			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5412			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5413			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5414			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5415			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5416			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5417			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5418			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5419			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5420			/* --- 6 */
5421			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5422			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5423			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5424			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5425			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5426			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5427			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5428			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5429			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5430			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5431			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5432			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5433			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5434			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5435			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5436			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5437			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5438			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5439			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5440			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5441			/* --- 7 */
5442			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5443			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5444			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5445			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5446			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5447			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5448			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5449			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5450			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5451			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5452			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5453			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5454			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5455			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5456			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5457			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5458			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5459			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5460			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5461			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5462			/* --- 8 */
5463			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5464			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5465			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5466			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5467			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5468			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5469			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5470			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5471			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5472			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5473			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5474			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5475			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5476			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5477			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
5478			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5479			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5480			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5481			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5482			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5483			/* --- 9 */
5484			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5485			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
5486			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
5487			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5488			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5489			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
5490			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
5491			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5492			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
5493			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5494			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
5495			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
5496			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5497			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
5498			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
5499			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
5500			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
5501			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
5502			/* --- */
5503			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
5504			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
5505			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
5506			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
5507			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
5508			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
5509			/* --- */
5510			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
5511			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
5512			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
5513			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
5514			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
5515			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
5516			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
5517			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
5518			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
5519			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
5520			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
5521			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
5522			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
5523			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
5524			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
5525			/* -- */
5526			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
5527			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
5528			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
5529			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
5530			/* --- */
5531			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
5532			"jnz            .L10352 \n\t"	/* check loop termination, proceed if required */
5533			"add          $8, %%esi \n\t"	/* move to the next row in Src */
5534			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
5535			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
5536			"jnz            .L10350 \n\t"	/* check loop termination, proceed if required */
5537			/* --- */
5538			"emms                   \n\t"	/* exit MMX state */
5539			"popa                   \n\t":"=m" (Dest)	/* %0 */
5540			:"m"(Src),		/* %1 */
5541			"m"(rows),		/* %2 */
5542			"m"(columns),		/* %3 */
5543			"m"(Kernel),		/* %4 */
5544			"m"(Divisor)		/* %5 */
5545			);
5546#endif
5547#endif
5548		return (0);
5549	} else {
5550		/* No non-MMX implementation yet */
5551		return (-1);
5552	}
5553}
5554
5555/*!
5556\brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... ) 
5557
5558\param Src The source 2D byte array to convolve. Should be different from destination.
5559\param Dest The destination 2D byte array to store the result in. Should be different from source.
5560\param rows Number of rows in source/destination array. Must be >2.
5561\param columns Number of columns in source/destination array. Must be >2.
5562\param Kernel The 2D convolution kernel of size 3x3.
5563\param NRightShift The number of right bit shifts to apply to the convolution sum. Must be <=7.
5564
5565Note: Non-MMX implementation not available for this function.
5566
5567\return Returns 0 for success or -1 for error (invalid parameters or no MMX support).
5568*/
5569int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5570											   signed short *Kernel, unsigned char NRightShift)
5571{
5572	/* Validate input parameters */
5573	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5574		return(-1);
5575
5576	if ((columns < 3) || (rows < 3) || (NRightShift > 7))
5577		return (-1);
5578
5579	if ((SDL_imageFilterMMXdetect())) {
5580#ifdef USE_MMX
5581#if !defined(GCC__)
5582		__asm
5583		{
5584			pusha
5585				pxor mm0, mm0   	/* zero MM0 */
5586				xor ebx, ebx   	/* zero EBX */
5587				mov bl, NRightShift   	/* load NRightShift into BL */
5588				movd mm4, ebx   	/* copy NRightShift into MM4 */
5589				mov edx, Kernel   	/* load Kernel address into EDX */
5590				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
5591			add edx, 8   	/* second row              |K0 K1 K2 0| */
5592				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
5593			add edx, 8   	/* third row               |K6 K7 K8 0| */
5594				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
5595			/* ---, */
5596			mov eax, columns   	/* load columns into EAX */
5597				mov esi, Src   	/* ESI = Src row 0 address */
5598				mov edi, Dest   	/* load Dest address to EDI */
5599				add edi, eax   	/* EDI = EDI + columns */
5600				inc              edi    	/* 1 byte offset from the left edge */
5601				mov edx, rows   	/* initialize ROWS counter */
5602				sub edx, 2   	/* do not use first and last row */
5603				/* ---, */
5604L10360:
5605			mov ecx, eax   	/* initialize COLUMS counter */
5606				sub ecx, 2   	/* do not use first and last column */
5607				align 16                 	/* 16 byte alignment of the loop entry */
5608L10362:
5609			/* ---, */
5610			movq mm1, [esi]   	/* load 8 bytes of the image first row */
5611			add esi, eax   	/* move one row below */
5612				movq mm2, [esi]   	/* load 8 bytes of the image second row */
5613			add esi, eax   	/* move one row below */
5614				movq mm3, [esi]   	/* load 8 bytes of the image third row */
5615			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
5616				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
5617				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
5618				psrlw mm1, mm4   	/* shift right each pixel NshiftRight times */
5619				psrlw mm2, mm4   	/* shift right each pixel NshiftRight times */
5620				psrlw mm3, mm4   	/* shift right each pixel NshiftRight times */
5621				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
5622				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
5623				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
5624				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
5625				paddsw mm1, mm3   	/* add 4 words of the third row and result */
5626				movq mm2, mm1   	/* copy MM1 into MM2 */
5627				psrlq mm1, 32   	/* shift 2 left words to the right */
5628				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
5629				movq mm3, mm1   	/* copy MM1 into MM3 */
5630				psrlq mm1, 16   	/* shift 1 left word to the right */
5631				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
5632				packuswb mm1, mm0   	/* pack shift result with saturation */
5633				movd ebx, mm1   	/* copy saturated result into EBX */
5634				mov [edi], bl   	/* copy a byte result into Dest */
5635				/* --, */
5636				sub esi, eax   	/* move two rows up */
5637				sub esi, eax
5638				inc              esi    	/* move Src  pointer to the next pixel */
5639				inc              edi    	/* move Dest pointer to the next pixel */
5640				/* ---, */
5641				dec              ecx    	/* decrease loop counter COLUMNS */
5642				jnz            L10362    	/* check loop termination, proceed if required */
5643				add esi, 2   	/* move to the next row in Src */
5644				add edi, 2   	/* move to the next row in Dest */
5645				dec              edx    	/* decrease loop counter ROWS */
5646				jnz            L10360    	/* check loop termination, proceed if required */
5647				/* ---, */
5648				emms                      	/* exit MMX state */
5649				popa
5650		}
5651#else
5652		asm volatile
5653			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
5654			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
5655			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
5656			"movd      %%ebx, %%mm4 \n\t"	/* copy NRightShift into MM4 */
5657			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
5658			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
5659			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
5660			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
5661			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
5662			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
5663			/* --- */
5664			"mov          %3, %%eax \n\t"	/* load columns into EAX */
5665			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
5666			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
5667			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
5668			"inc              %%edi