PageRenderTime 86ms CodeModel.GetById 14ms app.highlight 54ms RepoModel.GetById 1ms app.codeStats 2ms

/project/jni/sdl_gfx/SDL_imageFilter.c

https://github.com/aichunyu/FFPlayer
C | 7556 lines | 6038 code | 274 blank | 1244 comment | 520 complexity | d43bab1767b6590f1804da8a41766b90 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2
   3SDL_imageFilter - bytes-image "filter" routines.
   4(Uses inline x86 MMX or ASM optimizations if available and enabled.)
   5
   6LGPL (c) A. Schiffler
   7
   8Note: Most of the MMX code is based on published routines 
   9by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
  10him for his work.
  11
  12*/
  13
  14#include <stdio.h>
  15#include <stdlib.h>
  16#include <string.h>
  17
  18#include "SDL_imageFilter.h"
  19
  20/*!
  21\brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.). 
  22*/
  23#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))
  24
  25/* ------ Static variables ----- */
  26
  27/*! 
  28\brief Static state which enables the use of the MMX routines. Enabled by default 
  29*/
  30static int SDL_imageFilterUseMMX = 1;
  31
  32/* Detect GCC */
  33#if defined(__GNUC__)
  34#define GCC__
  35#endif
  36
  37/*!
  38\brief Internal function returning the CPU flags. 
  39
  40\returns Flags of system CPU.
  41*/
  42unsigned int _cpuFlags()
  43{
  44	int flags = 0;
  45
  46#ifdef USE_MMX
  47#if !defined(GCC__)
  48	__asm
  49	{
  50		pusha
  51			mov eax, 1
  52			cpuid	/* get CPU ID flag */
  53			mov flags,edx	/* move result to mmx_bit */
  54			popa
  55	}
  56#else
  57	asm volatile ("pusha		     \n\t" "mov    %1, %%eax     \n\t"	/* request feature flag */
  58		"cpuid                \n\t"	/* get CPU ID flag */
  59		"mov    %%edx, %0     \n\t"	/* move result to mmx_bit */
  60		"popa		     \n\t":"=m" (flags)	/* %0 */
  61		:"i"(0x00000001)	/* %1 */
  62		);
  63#endif
  64#endif
  65
  66	return (flags);
  67}
  68
  69/*!
  70\brief MMX detection routine (with override flag). 
  71
  72\returns 1 of MMX was detected, 0 otherwise.
  73*/
  74int SDL_imageFilterMMXdetect(void)
  75{
  76	unsigned int mmx_bit;
  77
  78	/* Check override flag */
  79	if (SDL_imageFilterUseMMX == 0) {
  80		return (0);
  81	}
  82
  83	mmx_bit = _cpuFlags();
  84	mmx_bit &= 0x00800000;
  85	mmx_bit = (mmx_bit && 0x00800000);
  86
  87	return (mmx_bit);
  88}
  89
  90/*!
  91\brief Disable MMX check for filter functions and and force to use non-MMX C based code.
  92*/
  93void SDL_imageFilterMMXoff()
  94{
  95	SDL_imageFilterUseMMX = 0;
  96}
  97
  98/*!
  99\brief Enable MMX check for filter functions and use MMX code if available.
 100*/
 101void SDL_imageFilterMMXon()
 102{
 103	SDL_imageFilterUseMMX = 1;
 104}
 105
 106/* ------------------------------------------------------------------------------------ */
 107
 108/*!
 109\brief Internal MMX Filter using Add: D = saturation255(S1 + S2) 
 110
 111\param Src1 Pointer to the start of the first source byte array (S1).
 112\param Src2 Pointer to the start of the second source byte array (S2).
 113\param Dest Pointer to the start of the destination byte array (D).
 114\param SrcLength The number of bytes in the source arrays.
 115
 116\return Returns 0 for success or -1 for error.
 117*/
 118int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 119{
 120#ifdef USE_MMX
 121#if !defined(GCC__)
 122	__asm
 123	{
 124		pusha
 125			mov eax, Src1	/* load Src1 address into eax */
 126			mov ebx, Src2	/* load Src2 address into ebx */
 127			mov edi, Dest	/* load Dest address into edi */
 128			mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
 129			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
 130			align 16	/* 16 byte alignment of the loop entry */
 131L1010:
 132		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
 133		paddusb mm1, [ebx]	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
 134		movq [edi], mm1	/* store result in Dest */
 135			add eax, 8	/* increase Src1, Src2 and Dest  */
 136			add ebx, 8	/* register pointers by 8 */
 137			add edi, 8
 138			dec ecx	/* decrease loop counter */
 139			jnz L1010	/* check loop termination, proceed if required */
 140			emms /* exit MMX state */
 141			popa
 142	}
 143#else
 144	asm volatile
 145		("pusha		     \n\t" "mov          %2, %%eax \n\t"	/* load Src1 address into eax */
 146		"mov          %1, %%ebx \n\t"	/* load Src2 address into ebx */
 147		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
 148		"mov          %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 149		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 150		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
 151		"1: movq (%%eax), %%mm1 \n\t"    	/* load 8 bytes from Src1 into mm1 */
 152		"paddusb (%%ebx), %%mm1 \n\t"	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
 153		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 154		"add          $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 155		"add          $8, %%ebx \n\t"	/* register pointers by 8 */
 156		"add          $8, %%edi \n\t" "dec              %%ecx \n\t"	/* decrease loop counter */
 157		"jnz             1b     \n\t"     /* check loop termination, proceed if required */
 158		"emms                   \n\t"	/* exit MMX state */
 159		"popa                   \n\t":"=m" (Dest)	/* %0 */
 160		:"m"(Src2),		/* %1 */
 161		"m"(Src1),		/* %2 */
 162		"m"(SrcLength)		/* %3 */
 163		);
 164#endif
 165	return (0);
 166#else
 167	return (-1);
 168#endif
 169}
 170
 171/*!
 172\brief Filter using Add: D = saturation255(S1 + S2) 
 173
 174\param Src1 Pointer to the start of the first source byte array (S1).
 175\param Src2 Pointer to the start of the second source byte array (S2).
 176\param Dest Pointer to the start of the destination byte array (D).
 177\param length The number of bytes in the source arrays.
 178
 179\return Returns 0 for success or -1 for error.
 180*/
 181int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 182{
 183	unsigned int i, istart;
 184	unsigned char *cursrc1, *cursrc2, *curdst;
 185	int result;
 186
 187	/* Validate input parameters */
 188	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 189		return(-1);
 190	if (length == 0)
 191		return(0);
 192
 193	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 194
 195		/* Use MMX assembly routine */
 196		SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
 197
 198		/* Check for unaligned bytes */
 199		if ((length & 7) > 0) {
 200			/* Setup to process unaligned bytes */
 201			istart = length & 0xfffffff8;
 202			cursrc1 = &Src1[istart];
 203			cursrc2 = &Src2[istart];
 204			curdst = &Dest[istart];
 205		} else {
 206			/* No unaligned bytes - we are done */
 207			return (0);
 208		}
 209	} else {
 210		/* Setup to process whole image */
 211		istart = 0;
 212		cursrc1 = Src1;
 213		cursrc2 = Src2;
 214		curdst = Dest;
 215	}
 216
 217	/* C routine to process image */
 218	for (i = istart; i < length; i++) {
 219		result = (int) *cursrc1 + (int) *cursrc2;
 220		if (result > 255)
 221			result = 255;
 222		*curdst = (unsigned char) result;
 223		/* Advance pointers */
 224		cursrc1++;
 225		cursrc2++;
 226		curdst++;
 227	}
 228
 229	return (0);
 230}
 231
 232/*!
 233\brief Internal MMX Filter using Mean: D = S1/2 + S2/2
 234
 235\param Src1 Pointer to the start of the first source byte array (S1).
 236\param Src2 Pointer to the start of the second source byte array (S2).
 237\param Dest Pointer to the start of the destination byte array (D).
 238\param SrcLength The number of bytes in the source arrays.
 239\param Mask Mask array containing 8 bytes with 0x7F value.
 240]
 241\return Returns 0 for success or -1 for error.
 242*/
 243int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
 244						   unsigned char *Mask)
 245{
 246#ifdef USE_MMX
 247#if !defined(GCC__)
 248	__asm
 249	{ 
 250		pusha
 251			mov edx, Mask /* load Mask address into edx */
 252			movq mm0, [edx] /* load Mask into mm0 */
 253		mov eax, Src1 /* load Src1 address into eax */
 254			mov ebx, Src2 /* load Src2 address into ebx */
 255			mov edi, Dest /* load Dest address into edi */
 256			mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
 257			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time) */
 258			align 16	/* 16 byte alignment of the loop entry */
 259L21011:
 260		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 261		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
 262		/* --- Byte shift via Word shift --- */
 263		psrlw mm1, 1 	/* shift 4 WORDS of mm1 1 bit to the right */
 264			psrlw mm2, 1 	/* shift 4 WORDS of mm2 1 bit to the right */
 265			pand mm1, mm0   // apply Mask to 8 BYTES of mm1 */
 266			/* byte     0x0f, 0xdb, 0xc8 */
 267			pand mm2, mm0   // apply Mask to 8 BYTES of mm2 */
 268			/* byte     0x0f, 0xdb, 0xd0 */
 269			paddusb mm1,  mm2 	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
 270			movq [edi],  mm1 	/* store result in Dest */
 271			add eax,  8 	/* increase Src1, Src2 and Dest  */
 272			add ebx,  8 	/* register pointers by 8 */
 273			add edi,  8
 274			dec ecx 	/* decrease loop counter */
 275			jnz L21011	/* check loop termination, proceed if required */
 276			emms	/* exit MMX state */
 277			popa
 278	}
 279#else
 280	asm volatile
 281		("pusha		     \n\t" "movl         %4, %%edx \n\t"	/* load Mask address into edx */
 282		"movq    (%%edx), %%mm0 \n\t"	/* load Mask into mm0 */
 283		"mov          %2, %%eax \n\t"	/* load Src1 address into eax */
 284		"mov          %1, %%ebx \n\t"	/* load Src2 address into ebx */
 285		"mov          %0, %%edi \n\t"	/* load Dest address into edi */
 286		"mov          %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 287		"shr          $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 288		".align 16              \n\t"	/* 16 byte alignment of the loop entry */
 289		"1:                      \n\t"
 290		"movq    (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
 291		"movq    (%%ebx), %%mm2 \n\t"	/* load 8 bytes from Src2 into mm2 */
 292		/* --- Byte shift via Word shift --- */
 293		"psrlw        $1, %%mm1 \n\t"	/* shift 4 WORDS of mm1 1 bit to the right */
 294		"psrlw        $1, %%mm2 \n\t"	/* shift 4 WORDS of mm2 1 bit to the right */
 295		/*      "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of mm1 */
 296		".byte     0x0f, 0xdb, 0xc8 \n\t"
 297		/*      "pand      %%mm0, %%mm2 \n\t"    // apply Mask to 8 BYTES of mm2 */
 298		".byte     0x0f, 0xdb, 0xd0 \n\t" 
 299		"paddusb   %%mm2, %%mm1 \n\t"	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
 300		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 301		"add          $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 302		"add          $8, %%ebx \n\t"	/* register pointers by 8 */
 303		"add          $8, %%edi \n\t" 
 304		"dec              %%ecx \n\t"	/* decrease loop counter */
 305		"jnz                 1b \n\t"     /* check loop termination, proceed if required */
 306		"emms                   \n\t"	/* exit MMX state */
 307		"popa                   \n\t":"=m" (Dest)	/* %0 */
 308		:"m"(Src2),		/* %1 */
 309		"m"(Src1),		/* %2 */
 310		"m"(SrcLength),		/* %3 */
 311		"m"(Mask)			/* %4 */
 312		);
 313#endif
 314	return (0);
 315#else
 316	return (-1);
 317#endif
 318}
 319
 320/*!
 321\brief Filter using Mean: D = S1/2 + S2/2
 322
 323\param Src1 Pointer to the start of the first source byte array (S1).
 324\param Src2 Pointer to the start of the second source byte array (S2).
 325\param Dest Pointer to the start of the destination byte array (D).
 326\param length The number of bytes in the source arrays.
 327
 328\return Returns 0 for success or -1 for error.
 329*/
 330int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 331{
 332	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
 333	unsigned int i, istart;
 334	unsigned char *cursrc1, *cursrc2, *curdst;
 335	int result;
 336
 337	/* Validate input parameters */
 338	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 339		return(-1);
 340	if (length == 0)
 341		return(0);
 342
 343	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 344		/* MMX routine */
 345		SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
 346
 347		/* Check for unaligned bytes */
 348		if ((length & 7) > 0) {
 349			/* Setup to process unaligned bytes */
 350			istart = length & 0xfffffff8;
 351			cursrc1 = &Src1[istart];
 352			cursrc2 = &Src2[istart];
 353			curdst = &Dest[istart];
 354		} else {
 355			/* No unaligned bytes - we are done */
 356			return (0);
 357		}
 358	} else {
 359		/* Setup to process whole image */
 360		istart = 0;
 361		cursrc1 = Src1;
 362		cursrc2 = Src2;
 363		curdst = Dest;
 364	}
 365
 366	/* C routine to process image */
 367	for (i = istart; i < length; i++) {
 368		result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
 369		*curdst = (unsigned char) result;
 370		/* Advance pointers */
 371		cursrc1++;
 372		cursrc2++;
 373		curdst++;
 374	}
 375
 376	return (0);
 377}
 378
 379/*!
 380\brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)
 381
 382\param Src1 Pointer to the start of the first source byte array (S1).
 383\param Src2 Pointer to the start of the second source byte array (S2).
 384\param Dest Pointer to the start of the destination byte array (D).
 385\param SrcLength The number of bytes in the source arrays.
 386
 387\return Returns 0 for success or -1 for error.
 388*/
 389int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 390{
 391#ifdef USE_MMX
 392#if !defined(GCC__)
 393	__asm
 394	{
 395		pusha
 396			mov eax,  Src1 	/* load Src1 address into eax */
 397			mov ebx,  Src2 	/* load Src2 address into ebx */
 398			mov edi,  Dest 	/* load Dest address into edi */
 399			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
 400			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
 401			align 16 /* 16 byte alignment of the loop entry */
 402L1012:
 403		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 404		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 405		movq [edi],  mm1 	/* store result in Dest */
 406			add eax, 8 	/* increase Src1, Src2 and Dest  */
 407			add ebx, 8 	/* register pointers by 8 */
 408			add edi, 8
 409			dec ecx	/* decrease loop counter */
 410			jnz L1012	/* check loop termination, proceed if required */
 411			emms /* exit MMX state */
 412			popa
 413	}
 414#else
 415	asm volatile
 416		("pusha		     \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 417		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 418		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 419		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 420		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 421		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 422		"1: movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
 423		"psubusb (%%ebx), %%mm1 \n\t"	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 424		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 425		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 426		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 427		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 428		"jnz 1b         \n\t"     /* check loop termination, proceed if required */
 429		"emms          \n\t"	/* exit MMX state */
 430		"popa                   \n\t":"=m" (Dest)	/* %0 */
 431		:"m"(Src2),		/* %1 */
 432		"m"(Src1),		/* %2 */
 433		"m"(SrcLength)		/* %3 */
 434		);
 435#endif
 436	return (0);
 437#else
 438	return (-1);
 439#endif
 440}
 441
 442/*!
 443\brief Filter using Sub: D = saturation0(S1 - S2)
 444
 445\param Src1 Pointer to the start of the first source byte array (S1).
 446\param Src2 Pointer to the start of the second source byte array (S2).
 447\param Dest Pointer to the start of the destination byte array (D).
 448\param length The number of bytes in the source arrays.
 449
 450\return Returns 0 for success or -1 for error.
 451*/
 452int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 453{
 454	unsigned int i, istart;
 455	unsigned char *cursrc1, *cursrc2, *curdst;
 456	int result;
 457
 458	/* Validate input parameters */
 459	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 460		return(-1);
 461	if (length == 0)
 462		return(0);
 463
 464	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 465		/* MMX routine */
 466		SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
 467
 468		/* Check for unaligned bytes */
 469		if ((length & 7) > 0) {
 470			/* Setup to process unaligned bytes */
 471			istart = length & 0xfffffff8;
 472			cursrc1 = &Src1[istart];
 473			cursrc2 = &Src2[istart];
 474			curdst = &Dest[istart];
 475		} else {
 476			/* No unaligned bytes - we are done */
 477			return (0);
 478		}
 479	} else {
 480		/* Setup to process whole image */
 481		istart = 0;
 482		cursrc1 = Src1;
 483		cursrc2 = Src2;
 484		curdst = Dest;
 485	}
 486
 487	/* C routine to process image */
 488	for (i = istart; i < length; i++) {
 489		result = (int) *cursrc1 - (int) *cursrc2;
 490		if (result < 0)
 491			result = 0;
 492		*curdst = (unsigned char) result;
 493		/* Advance pointers */
 494		cursrc1++;
 495		cursrc2++;
 496		curdst++;
 497	}
 498
 499	return (0);
 500}
 501
 502/*!
 503\brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |
 504
 505\param Src1 Pointer to the start of the first source byte array (S1).
 506\param Src2 Pointer to the start of the second source byte array (S2).
 507\param Dest Pointer to the start of the destination byte array (D).
 508\param SrcLength The number of bytes in the source arrays.
 509
 510\return Returns 0 for success or -1 for error.
 511*/
 512int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 513{
 514#ifdef USE_MMX
 515#if !defined(GCC__)
 516	__asm
 517	{
 518		pusha
 519			mov eax, Src1  	/* load Src1 address into eax */
 520			mov ebx, Src2 	/* load Src2 address into ebx */
 521			mov edi, Dest 	/* load Dest address into edi */
 522			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
 523			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
 524			align 16	/* 16 byte alignment of the loop entry */
 525L1013:
 526		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 527		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
 528		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 529		psubusb mm2,  [eax] 	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
 530		por mm1,  mm2 	/* combine both mm2 and mm1 results */
 531			movq [edi],  mm1 	/* store result in Dest */
 532			add eax, 8 	/* increase Src1, Src2 and Dest  */
 533			add ebx, 8 	/* register pointers by 8 */
 534			add edi, 8
 535			dec ecx 	/* decrease loop counter */
 536			jnz L1013    	/* check loop termination, proceed if required */
 537			emms         /* exit MMX state */
 538			popa
 539	}
 540#else
 541	asm volatile
 542		("pusha		     \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 543		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 544		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 545		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 546		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 547		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 548		"1: movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
 549		"movq    (%%ebx), %%mm2 \n\t"	/* load 8 bytes from Src2 into mm2 */
 550		"psubusb (%%ebx), %%mm1 \n\t"	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
 551		"psubusb (%%eax), %%mm2 \n\t"	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
 552		"por       %%mm2, %%mm1 \n\t"	/* combine both mm2 and mm1 results */
 553		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 554		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 555		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 556		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 557		"jnz 1b        \n\t"      /* check loop termination, proceed if required */
 558		"emms          \n\t"	/* exit MMX state */
 559		"popa                   \n\t":"=m" (Dest)	/* %0 */
 560		:"m"(Src2),		/* %1 */
 561		"m"(Src1),		/* %2 */
 562		"m"(SrcLength)		/* %3 */
 563		);
 564#endif
 565	return (0);
 566#else
 567	return (-1);
 568#endif
 569}
 570
 571/*!
 572\brief Filter using AbsDiff: D = | S1 - S2 |
 573
 574\param Src1 Pointer to the start of the first source byte array (S1).
 575\param Src2 Pointer to the start of the second source byte array (S2).
 576\param Dest Pointer to the start of the destination byte array (D).
 577\param length The number of bytes in the source arrays.
 578
 579\return Returns 0 for success or -1 for error.
 580*/
 581int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 582{
 583	unsigned int i, istart;
 584	unsigned char *cursrc1, *cursrc2, *curdst;
 585	int result;
 586
 587	/* Validate input parameters */
 588	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 589		return(-1);
 590	if (length == 0)
 591		return(0);
 592
 593	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 594		/* MMX routine */
 595		SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
 596
 597		/* Check for unaligned bytes */
 598		if ((length & 7) > 0) {
 599			/* Setup to process unaligned bytes */
 600			istart = length & 0xfffffff8;
 601			cursrc1 = &Src1[istart];
 602			cursrc2 = &Src2[istart];
 603			curdst = &Dest[istart];
 604		} else {
 605			/* No unaligned bytes - we are done */
 606			return (0);
 607		}
 608	} else {
 609		/* Setup to process whole image */
 610		istart = 0;
 611		cursrc1 = Src1;
 612		cursrc2 = Src2;
 613		curdst = Dest;
 614	}
 615
 616	/* C routine to process image */
 617	for (i = istart; i < length; i++) {
 618		result = abs((int) *cursrc1 - (int) *cursrc2);
 619		*curdst = (unsigned char) result;
 620		/* Advance pointers */
 621		cursrc1++;
 622		cursrc2++;
 623		curdst++;
 624	}
 625
 626	return (0);
 627}
 628
 629/*!
 630\brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)
 631
 632\param Src1 Pointer to the start of the first source byte array (S1).
 633\param Src2 Pointer to the start of the second source byte array (S2).
 634\param Dest Pointer to the start of the destination byte array (D).
 635\param SrcLength The number of bytes in the source arrays.
 636
 637\return Returns 0 for success or -1 for error.
 638*/
 639int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 640{
 641#ifdef USE_MMX
 642#if !defined(GCC__)
 643	__asm
 644	{
 645		pusha
 646			mov eax, Src1   /* load Src1 address into eax */
 647			mov ebx, Src2   /* load Src2 address into ebx */
 648			mov edi, Dest   /* load Dest address into edi */
 649			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
 650			shr ecx, 3   /* counter/8 (MMX loads 8 bytes at a time) */
 651			pxor mm0, mm0   /* zero mm0 register */
 652			align 16      	/* 16 byte alignment of the loop entry */
 653L1014:
 654		movq mm1, [eax]   /* load 8 bytes from Src1 into mm1 */
 655		movq mm3, [ebx]   /* load 8 bytes from Src2 into mm3 */
 656		movq mm2, mm1   /* copy mm1 into mm2 */
 657			movq mm4, mm3   /* copy mm3 into mm4  */
 658			punpcklbw mm1, mm0   /* unpack low  bytes of Src1 into words */
 659			punpckhbw mm2, mm0   /* unpack high bytes of Src1 into words */
 660			punpcklbw mm3, mm0   /* unpack low  bytes of Src2 into words */
 661			punpckhbw mm4, mm0   /* unpack high bytes of Src2 into words */
 662			pmullw mm1, mm3   /* mul low  bytes of Src1 and Src2  */
 663			pmullw mm2, mm4   /* mul high bytes of Src1 and Src2 */
 664			/* Take abs value of the results (signed words) */
 665			movq mm5, mm1   /* copy mm1 into mm5 */
 666			movq mm6, mm2   /* copy mm2 into mm6 */
 667			psraw mm5, 15   /* fill mm5 words with word sign bit */
 668			psraw mm6, 15   /* fill mm6 words with word sign bit */
 669			pxor mm1, mm5   /* take 1's compliment of only neg. words */
 670			pxor mm2, mm6   /* take 1's compliment of only neg. words */
 671			psubsw mm1, mm5   /* add 1 to only neg. words, W-(-1) or W-0 */
 672			psubsw mm2, mm6   /* add 1 to only neg. words, W-(-1) or W-0 */
 673			packuswb mm1, mm2   /* pack words back into bytes with saturation */
 674			movq [edi], mm1   /* store result in Dest */
 675			add eax, 8   /* increase Src1, Src2 and Dest  */
 676			add ebx, 8   /* register pointers by 8 */
 677			add edi, 8
 678			dec ecx 	/* decrease loop counter */
 679			jnz L1014	/* check loop termination, proceed if required */
 680			emms /* exit MMX state */
 681			popa
 682	}
 683#else
 684	asm volatile
 685		("pusha		     \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 686		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 687		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 688		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 689		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 690		"pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
 691		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 692		"1: movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
 693		"movq    (%%ebx), %%mm3 \n\t"	/* load 8 bytes from Src2 into mm3 */
 694		"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
 695		"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
 696		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
 697		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
 698		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
 699		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
 700		"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
 701		"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
 702		/* Take abs value of the results (signed words) */
 703		"movq      %%mm1, %%mm5 \n\t"	/* copy mm1 into mm5 */
 704		"movq      %%mm2, %%mm6 \n\t"	/* copy mm2 into mm6 */
 705		"psraw       $15, %%mm5 \n\t"	/* fill mm5 words with word sign bit */
 706		"psraw       $15, %%mm6 \n\t"	/* fill mm6 words with word sign bit */
 707		"pxor      %%mm5, %%mm1 \n\t"	/* take 1's compliment of only neg. words */
 708		"pxor      %%mm6, %%mm2 \n\t"	/* take 1's compliment of only neg. words */
 709		"psubsw    %%mm5, %%mm1 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
 710		"psubsw    %%mm6, %%mm2 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
 711		"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
 712		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 713		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 714		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 715		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 716		"jnz 1b        \n\t"      /* check loop termination, proceed if required */
 717		"emms          \n\t"	/* exit MMX state */
 718		"popa \n\t":"=m" (Dest)	/* %0 */
 719		:"m"(Src2),		/* %1 */
 720		"m"(Src1),		/* %2 */
 721		"m"(SrcLength)		/* %3 */
 722		);
 723#endif
 724	return (0);
 725#else
 726	return (-1);
 727#endif
 728}
 729
 730/*!
 731\brief Filter using Mult: D = saturation255(S1 * S2)
 732
 733\param Src1 Pointer to the start of the first source byte array (S1).
 734\param Src2 Pointer to the start of the second source byte array (S2).
 735\param Dest Pointer to the start of the destination byte array (D).
 736\param length The number of bytes in the source arrays.
 737
 738\return Returns 0 for success or -1 for error.
 739*/
 740int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 741{
 742	unsigned int i, istart;
 743	unsigned char *cursrc1, *cursrc2, *curdst;
 744	int result;
 745
 746	/* Validate input parameters */
 747	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 748		return(-1);
 749	if (length == 0)
 750		return(0);
 751
 752	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
 753		/* MMX routine */
 754		SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
 755
 756		/* Check for unaligned bytes */
 757		if ((length & 7) > 0) {
 758			/* Setup to process unaligned bytes */
 759			istart = length & 0xfffffff8;
 760			cursrc1 = &Src1[istart];
 761			cursrc2 = &Src2[istart];
 762			curdst = &Dest[istart];
 763		} else {
 764			/* No unaligned bytes - we are done */
 765			return (0);
 766		}
 767	} else {
 768		/* Setup to process whole image */
 769		istart = 0;
 770		cursrc1 = Src1;
 771		cursrc2 = Src2;
 772		curdst = Dest;
 773	}
 774
 775	/* C routine to process image */
 776	for (i = istart; i < length; i++) {
 777
 778		/* NOTE: this is probably wrong - dunno what the MMX code does */
 779
 780		result = (int) *cursrc1 * (int) *cursrc2;
 781		if (result > 255)
 782			result = 255;
 783		*curdst = (unsigned char) result;
 784		/* Advance pointers */
 785		cursrc1++;
 786		cursrc2++;
 787		curdst++;
 788	}
 789
 790	return (0);
 791}
 792
 793/*!
 794\brief Internal ASM Filter using MultNor: D = S1 * S2
 795
 796\param Src1 Pointer to the start of the first source byte array (S1).
 797\param Src2 Pointer to the start of the second source byte array (S2).
 798\param Dest Pointer to the start of the destination byte array (D).
 799\param SrcLength The number of bytes in the source arrays.
 800
 801\return Returns 0 for success or -1 for error.
 802*/
 803int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 804{
 805#ifdef USE_MMX
 806#if !defined(GCC__)
 807	__asm
 808	{
 809		pusha
 810			mov edx, Src1   /* load Src1 address into edx */
 811			mov esi, Src2   /* load Src2 address into esi */
 812			mov edi, Dest   /* load Dest address into edi */
 813			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
 814			align 16 	/* 16 byte alignment of the loop entry */
 815L10141:
 816		mov al, [edx]   /* load a byte from Src1 */
 817		mul [esi] 	/* mul with a byte from Src2 */
 818		mov [edi], al   /* move a byte result to Dest */
 819			inc edx 	/* increment Src1, Src2, Dest */
 820			inc esi 		/* pointer registers by one */
 821			inc edi
 822			dec ecx	/* decrease loop counter */
 823			jnz L10141  	/* check loop termination, proceed if required */
 824			popa
 825	}
 826#else
 827	asm volatile
 828		("pusha		     \n\t" "mov %2, %%edx \n\t"	/* load Src1 address into edx */
 829		"mov %1, %%esi \n\t"	/* load Src2 address into esi */
 830		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 831		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 832		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 833		"1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
 834		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
 835		"mov %%al, (%%edi)  \n\t"       /* move a byte result to Dest */
 836		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
 837		"inc %%esi \n\t"		/* pointer registers by one */
 838		"inc %%edi \n\t" "dec %%ecx      \n\t"	/* decrease loop counter */
 839		"jnz 1b         \n\t"     /* check loop termination, proceed if required */
 840		"popa                   \n\t":"=m" (Dest)	/* %0 */
 841		:"m"(Src2),		/* %1 */
 842		"m"(Src1),		/* %2 */
 843		"m"(SrcLength)		/* %3 */
 844		);
 845#endif
 846	return (0);
 847#else
 848	return (-1);
 849#endif
 850}
 851
 852/*!
 853\brief Filter using MultNor: D = S1 * S2
 854
 855\param Src1 Pointer to the start of the first source byte array (S1).
 856\param Src2 Pointer to the start of the second source byte array (S2).
 857\param Dest Pointer to the start of the destination byte array (D).
 858\param length The number of bytes in the source arrays.
 859
 860\return Returns 0 for success or -1 for error.
 861*/
 862int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
 863{
 864	unsigned int i, istart;
 865	unsigned char *cursrc1, *cursrc2, *curdst;
 866	int result;
 867
 868	/* Validate input parameters */
 869	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
 870		return(-1);
 871	if (length == 0)
 872		return(0);
 873
 874	if (SDL_imageFilterMMXdetect()) {
 875		if (length > 0) {
 876			/* ASM routine */
 877			SDL_imageFilterMultNorASM(Src1, Src2, Dest, length);
 878
 879			/* Check for unaligned bytes */
 880			if ((length & 7) > 0) {
 881				/* Setup to process unaligned bytes */
 882				istart = length & 0xfffffff8;
 883				cursrc1 = &Src1[istart];
 884				cursrc2 = &Src2[istart];
 885				curdst = &Dest[istart];
 886			} else {
 887				/* No unaligned bytes - we are done */
 888				return (0);
 889			}
 890		} else {
 891			/* No bytes - we are done */
 892			return (0);
 893		}
 894	} else {
 895		/* Setup to process whole image */
 896		istart = 0;
 897		cursrc1 = Src1;
 898		cursrc2 = Src2;
 899		curdst = Dest;
 900	}
 901
 902	/* C routine to process image */
 903	for (i = istart; i < length; i++) {
 904		result = (int) *cursrc1 * (int) *cursrc2;
 905		*curdst = (unsigned char) result;
 906		/* Advance pointers */
 907		cursrc1++;
 908		cursrc2++;
 909		curdst++;
 910	}
 911
 912	return (0);
 913}
 914
 915/*!
 916\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
 917
 918\param Src1 Pointer to the start of the first source byte array (S1).
 919\param Src2 Pointer to the start of the second source byte array (S2).
 920\param Dest Pointer to the start of the destination byte array (D).
 921\param SrcLength The number of bytes in the source arrays.
 922
 923\return Returns 0 for success or -1 for error.
 924*/
 925int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
 926{
 927#ifdef USE_MMX
 928#if !defined(GCC__)
 929	__asm
 930	{ 
 931		pusha
 932			mov eax, Src1   	/* load Src1 address into eax */
 933			mov ebx, Src2   	/* load Src2 address into ebx */
 934			mov edi, Dest   	/* load Dest address into edi */
 935			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
 936			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
 937			pxor mm0,  mm0 	/* zero mm0 register */
 938			align 16          	/* 16 byte alignment of the loop entry */
 939L1015:
 940		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
 941		movq mm3,  [ebx] 	/* load 8 bytes from Src2 into mm3 */
 942		movq mm2,  mm1 	/* copy mm1 into mm2 */
 943			movq mm4,  mm3 	/* copy mm3 into mm4  */
 944			punpcklbw mm1,  mm0 	/* unpack low  bytes of Src1 into words */
 945			punpckhbw mm2,  mm0 	/* unpack high bytes of Src1 into words */
 946			punpcklbw mm3,  mm0 	/* unpack low  bytes of Src2 into words */
 947			punpckhbw mm4,  mm0 	/* unpack high bytes of Src2 into words */
 948			psrlw mm1,  1 	/* divide mm1 words by 2, Src1 low bytes */
 949			psrlw mm2,  1 	/* divide mm2 words by 2, Src1 high bytes */
 950			pmullw mm1,  mm3 	/* mul low  bytes of Src1 and Src2  */
 951			pmullw mm2,  mm4 	/* mul high bytes of Src1 and Src2 */
 952			packuswb mm1,  mm2 	/* pack words back into bytes with saturation */
 953			movq [edi],  mm1 	/* store result in Dest */
 954			add eax,  8 	/* increase Src1, Src2 and Dest  */
 955			add ebx,  8 	/* register pointers by 8 */
 956			add edi,  8
 957			dec ecx        	/* decrease loop counter */
 958			jnz L1015       	/* check loop termination, proceed if required */
 959			emms             	/* exit MMX state */
 960			popa
 961	}
 962#else
 963	asm volatile
 964		("pusha \n\t" "mov %2, %%eax \n\t"	/* load Src1 address into eax */
 965		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
 966		"mov %0, %%edi \n\t"	/* load Dest address into edi */
 967		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
 968		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
 969		"pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
 970		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
 971		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
 972		"movq    (%%ebx), %%mm3 \n\t"	/* load 8 bytes from Src2 into mm3 */
 973		"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
 974		"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
 975		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
 976		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
 977		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
 978		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
 979		"psrlw        $1, %%mm1 \n\t"	/* divide mm1 words by 2, Src1 low bytes */
 980		"psrlw        $1, %%mm2 \n\t"	/* divide mm2 words by 2, Src1 high bytes */
 981		"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
 982		"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
 983		"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
 984		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
 985		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
 986		"add $8, %%ebx \n\t"	/* register pointers by 8 */
 987		"add $8, %%edi \n\t" "dec %%ecx     \n\t"	/* decrease loop counter */
 988		"jnz 1b        \n\t"	/* check loop termination, proceed if required */
 989		"emms          \n\t"	/* exit MMX state */
 990		"popa \n\t":"=m" (Dest)	/* %0 */
 991		:"m"(Src2),		/* %1 */
 992		"m"(Src1),		/* %2 */
 993		"m"(SrcLength)		/* %3 */
 994		);
 995#endif
 996	return (0);
 997#else
 998	return (-1);
 999#endif
1000}
1001
/*!
\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)

Every byte of S1 is halved (integer division), multiplied with the matching
byte of S2 and clamped to 255. If MMX is detected and at least 8 bytes are
given, the MMX routine is called first and the C loop only finishes the
bytes beyond the last multiple of 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* first byte the C loop still has to handle */
	int product;

	/* Reject missing buffers, accept empty ones */
	if (!Src1 || !Src2 || !Dest)
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX pass over the leading multiple-of-8 part */
		SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);

		/* Continue after the last complete 8-byte block, if anything is left */
		pos = length & 0xfffffff8;
		if (pos == length)
			return (0);
	}

	/* C pass over the remaining bytes (or everything without MMX) */
	for (; pos < length; pos++) {
		product = ((int) Src1[pos] / 2) * (int) Src2[pos];
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
	}

	return (0);
}
1061
/*!
\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Only complete blocks of 8 bytes are processed (SrcLength/8 blocks); any
remaining SrcLength%8 bytes are left untouched and must be handled by the
caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (MMX support not compiled in).
*/
int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/*
	 * With fewer than 8 bytes the block counter (SrcLength >> 3) is zero;
	 * the loop decrements before testing, so it would wrap around and run
	 * far beyond the arrays. There is no complete block to process.
	 */
	if (SrcLength < 8)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			pxor mm0, mm0   	/* zero mm0 register */
			align 16          	/* 16 byte alignment of the loop entry */
L1016:
		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
		movq mm3, [ebx]   	/* load 8 bytes from Src2 into mm3 */
		movq mm2, mm1   	/* copy mm1 into mm2 */
			movq mm4, mm3   	/* copy mm3 into mm4  */
			punpcklbw mm1, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm2, mm0   	/* unpack high bytes of Src1 into words */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src2 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src2 into words */
			psrlw mm1, 1   	/* divide mm1 words by 2, Src1 low bytes */
			psrlw mm2, 1   	/* divide mm2 words by 2, Src1 high bytes */
			psrlw mm3, 1   	/* divide mm3 words by 2, Src2 low bytes */
			psrlw mm4, 1   	/* divide mm4 words by 2, Src2 high bytes */
			pmullw mm1, mm3   	/* mul low  bytes of Src1 and Src2  */
			pmullw mm2, mm4   	/* mul high bytes of Src1 and Src2 */
			packuswb mm1, mm2   	/* pack words back into bytes with saturation */
			movq [edi], mm1   	/* store result in Dest */
			add eax, 8   	/* increase Src1, Src2 and Dest  */
			add ebx, 8   	/* register pointers by 8 */
			add edi,  8
			dec ecx        	/* decrease loop counter */
			jnz L1016       	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * Dest is only read by the assembly (its value is the store address),
	 * so it is passed as an input operand. The bytes written through it
	 * are announced to the compiler with the "memory" clobber.
	 */
	asm volatile
		("pusha \n\t"
		"mov %2, %%eax \n\t"	/* load Src1 address into eax */
		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		"pxor      %%mm0, %%mm0 \n\t"	/* zero mm0 register */
		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
		"movq    (%%ebx), %%mm3 \n\t"	/* load 8 bytes from Src2 into mm3 */
		"movq      %%mm1, %%mm2 \n\t"	/* copy mm1 into mm2 */
		"movq      %%mm3, %%mm4 \n\t"	/* copy mm3 into mm4  */
		"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack low  bytes of Src1 into words */
		"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack high bytes of Src1 into words */
		"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack low  bytes of Src2 into words */
		"punpckhbw %%mm0, %%mm4 \n\t"	/* unpack high bytes of Src2 into words */
		"psrlw        $1, %%mm1 \n\t"	/* divide mm1 words by 2, Src1 low bytes */
		"psrlw        $1, %%mm2 \n\t"	/* divide mm2 words by 2, Src1 high bytes */
		"psrlw        $1, %%mm3 \n\t"	/* divide mm3 words by 2, Src2 low bytes */
		"psrlw        $1, %%mm4 \n\t"	/* divide mm4 words by 2, Src2 high bytes */
		"pmullw    %%mm3, %%mm1 \n\t"	/* mul low  bytes of Src1 and Src2  */
		"pmullw    %%mm4, %%mm2 \n\t"	/* mul high bytes of Src1 and Src2 */
		"packuswb  %%mm2, %%mm1 \n\t"	/* pack words back into bytes with saturation */
		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
		"add $8, %%ebx \n\t"	/* register pointers by 8 */
		"add $8, %%edi \n\t"
		"dec %%ecx     \n\t"	/* decrease loop counter */
		"jnz 1b        \n\t"	/* check loop termination, proceed if required */
		"emms          \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no output operands */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"
		);
#endif
	return (0);
#else
	/* MMX support was not compiled in; nothing has been written to Dest. */
	return (-1);
#endif
}
1152
/*!
\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Both source bytes are halved (integer division) before they are multiplied;
the product is clamped to 255. If MMX is detected and at least 8 bytes are
given, the MMX routine is called first and the C loop only finishes the
bytes beyond the last multiple of 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* first byte the C loop still has to handle */
	int product;

	/* Reject missing buffers, accept empty ones */
	if (!Src1 || !Src2 || !Dest)
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX pass over the leading multiple-of-8 part */
		SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);

		/* Continue after the last complete 8-byte block, if anything is left */
		pos = length & 0xfffffff8;
		if (pos == length)
			return (0);
	}

	/* C pass over the remaining bytes (or everything without MMX) */
	for (; pos < length; pos++) {
		product = ((int) Src1[pos] / 2) * ((int) Src2[pos] / 2);
		Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
	}

	return (0);
}
1212
/*!
\brief Internal MMX Filter using BitAnd: D = S1 & S2

Only complete blocks of 8 bytes are processed (SrcLength/8 blocks); any
remaining SrcLength%8 bytes are left untouched and must be handled by the
caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (MMX support not compiled in).
*/
int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/*
	 * With fewer than 8 bytes the block counter (SrcLength >> 3) is zero;
	 * the loop decrements before testing, so it would wrap around and run
	 * far beyond the arrays. There is no complete block to process.
	 */
	if (SrcLength < 8)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16          	/* 16 byte alignment of the loop entry */
L1017:
		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
		pand mm1, [ebx]   	/* mm1=Src1&Src2 */
		movq [edi], mm1   	/* store result in Dest */
			add eax, 8   	/* increase Src1, Src2 and Dest  */
			add ebx, 8   	/* register pointers by 8 */
			add edi, 8
			dec ecx        	/* decrease loop counter */
			jnz L1017       	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * Dest is only read by the assembly (its value is the store address),
	 * so it is passed as an input operand. The bytes written through it
	 * are announced to the compiler with the "memory" clobber.
	 */
	asm volatile
		("pusha \n\t"
		"mov %2, %%eax \n\t"	/* load Src1 address into eax */
		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
		"pand    (%%ebx), %%mm1 \n\t"	/* mm1=Src1&Src2 */
		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
		"add $8, %%ebx \n\t"	/* register pointers by 8 */
		"add $8, %%edi \n\t"
		"dec %%ecx     \n\t"	/* decrease loop counter */
		"jnz 1b        \n\t"	/* check loop termination, proceed if required */
		"emms          \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no output operands */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"
		);
#endif
	return (0);
#else
	/* MMX support was not compiled in; nothing has been written to Dest. */
	return (-1);
#endif
}
1275
/*!
\brief Filter using BitAnd: D = S1 & S2

Writes the bitwise AND of matching source bytes to the destination. If MMX
is detected and at least 8 bytes are given, the MMX routine is called first
and the C loop only finishes the bytes beyond the last multiple of 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int pos = 0;	/* first byte the C loop still has to handle */

	/* Reject missing buffers, accept empty ones */
	if (!Src1 || !Src2 || !Dest)
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect() > 0) && (length > 7)) {
		/* MMX pass over the leading multiple-of-8 part */
		SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);

		/* Continue after the last complete 8-byte block, if anything is left */
		pos = length & 0xfffffff8;
		if (pos == length)
			return (0);
	}

	/* C pass over the remaining bytes (or everything without MMX) */
	for (; pos < length; pos++)
		Dest[pos] = Src1[pos] & Src2[pos];

	return (0);
}
1334
/*!
\brief Internal MMX Filter using BitOr: D = S1 | S2

Only complete blocks of 8 bytes are processed (SrcLength/8 blocks); any
remaining SrcLength%8 bytes are left untouched and must be handled by the
caller.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (MMX support not compiled in).
*/
int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/*
	 * With fewer than 8 bytes the block counter (SrcLength >> 3) is zero;
	 * the loop decrements before testing, so it would wrap around and run
	 * far beyond the arrays. There is no complete block to process.
	 */
	if (SrcLength < 8)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16          	/* 16 byte alignment of the loop entry */
L91017:
		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
		por mm1, [ebx]   	/* mm1=Src1|Src2 */
		movq [edi], mm1   	/* store result in Dest */
			add eax, 8   	/* increase Src1, Src2 and Dest  */
			add ebx, 8   	/* register pointers by 8 */
			add edi,  8
			dec ecx        	/* decrease loop counter */
			jnz L91017      	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/*
	 * Dest is only read by the assembly (its value is the store address),
	 * so it is passed as an input operand. The bytes written through it
	 * are announced to the compiler with the "memory" clobber.
	 */
	asm volatile
		("pusha \n\t"
		"mov %2, %%eax \n\t"	/* load Src1 address into eax */
		"mov %1, %%ebx \n\t"	/* load Src2 address into ebx */
		"mov %0, %%edi \n\t"	/* load Dest address into edi */
		"mov %3, %%ecx \n\t"	/* load loop counter (SIZE) into ecx */
		"shr $3, %%ecx \n\t"	/* counter/8 (MMX loads 8 bytes at a time) */
		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
		"1: movq (%%eax), %%mm1 \n\t"	/* load 8 bytes from Src1 into mm1 */
		"por     (%%ebx), %%mm1 \n\t"	/* mm1=Src1|Src2 */
		"movq    %%mm1, (%%edi) \n\t"	/* store result in Dest */
		"add $8, %%eax \n\t"	/* increase Src1, Src2 and Dest  */
		"add $8, %%ebx \n\t"	/* register pointers by 8 */
		"add $8, %%edi \n\t"
		"dec %%ecx     \n\t"	/* decrease loop counter */
		"jnz 1b        \n\t"	/* check loop termination, proceed if required */
		"emms          \n\t"	/* exit MMX state */
		"popa                   \n\t"
		:			/* no output operands */
		:"m"(Dest),		/* %0 */
		"m"(Src2),		/* %1 */
		"m"(Src1),		/* %2 */
		"m"(SrcLength)		/* %3 */
		:"memory"
		);
#endif
	return (0);
#else
	/* MMX support was not compiled in; nothing has been written to Dest. */
	return (-1);
#endif
}
1397
1398/*!
1399\brief Filter using BitOr: D = S1 | S2
1400
1401\param Src1 Pointer to the start of the first source byte array (S1).
1402\param Src2 Pointer to the start of the second source byte array (S2).
1403\param Dest Pointer to the start of the destination byte array (D).
1404\param length The number of bytes in the source arrays.
1405
1406\return Returns 0 for success or -1 for error.
1407*/
1408int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1409{
1410	unsigned int i, istart;
1411	unsigned char *cursrc1, *cursrc2, *curdst;
1412
1413	/* Validate input parameters */
1414	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1415		return(-1);
1416	if (length == 0)
1417		return(0);
1418
1419	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1420
1421		/* MMX routine */
1422		SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
1423
1424		/* Check for unaligned bytes */
1425		if ((length & 7) > 0) {
1426			/* Setup to process unaligned bytes */
1427			istart = length & 0xfffffff8;
1428			cursrc1 = &Src1[istart];
1429			cursrc2 = &Src2[istart];
1430			curdst = &Dest[istart];
1431		} else {
1432			/*…

Large files files are truncated, but you can click here to view the full file