/project/jni/sdl_gfx/SDL_imageFilter.c

https://github.com/aichunyu/FFPlayer · C · 7556 lines · 6038 code · 274 blank · 1244 comment · 520 complexity · d43bab1767b6590f1804da8a41766b90 MD5 · raw file

Large files are truncated click here to view the full file

  1. /*
  2. SDL_imageFilter - bytes-image "filter" routines.
  3. (Uses inline x86 MMX or ASM optimizations if available and enabled.)
  4. LGPL (c) A. Schiffler
  5. Note: Most of the MMX code is based on published routines
  6. by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to
  7. him for his work.
  8. */
  9. #include <stdio.h>
  10. #include <stdlib.h>
  11. #include <string.h>
  12. #include "SDL_imageFilter.h"
  13. /*!
  14. \brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.).
  15. */
  16. #define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24))
  17. /* ------ Static variables ----- */
  18. /*!
  19. \brief Static state which enables the use of the MMX routines. Enabled by default
  20. */
  21. static int SDL_imageFilterUseMMX = 1;
  22. /* Detect GCC */
  23. #if defined(__GNUC__)
  24. #define GCC__
  25. #endif
  26. /*!
  27. \brief Internal function returning the CPU flags.
  28. \returns Flags of system CPU.
  29. */
  30. unsigned int _cpuFlags()
  31. {
  32. int flags = 0;
  33. #ifdef USE_MMX
  34. #if !defined(GCC__)
  35. __asm
  36. {
  37. pusha
  38. mov eax, 1
  39. cpuid /* get CPU ID flag */
  40. mov flags,edx /* move result to mmx_bit */
  41. popa
  42. }
  43. #else
  44. asm volatile ("pusha \n\t" "mov %1, %%eax \n\t" /* request feature flag */
  45. "cpuid \n\t" /* get CPU ID flag */
  46. "mov %%edx, %0 \n\t" /* move result to mmx_bit */
  47. "popa \n\t":"=m" (flags) /* %0 */
  48. :"i"(0x00000001) /* %1 */
  49. );
  50. #endif
  51. #endif
  52. return (flags);
  53. }
  54. /*!
  55. \brief MMX detection routine (with override flag).
  56. \returns 1 of MMX was detected, 0 otherwise.
  57. */
  58. int SDL_imageFilterMMXdetect(void)
  59. {
  60. unsigned int mmx_bit;
  61. /* Check override flag */
  62. if (SDL_imageFilterUseMMX == 0) {
  63. return (0);
  64. }
  65. mmx_bit = _cpuFlags();
  66. mmx_bit &= 0x00800000;
  67. mmx_bit = (mmx_bit && 0x00800000);
  68. return (mmx_bit);
  69. }
  70. /*!
  71. \brief Disable MMX check for filter functions and and force to use non-MMX C based code.
  72. */
  73. void SDL_imageFilterMMXoff()
  74. {
  75. SDL_imageFilterUseMMX = 0;
  76. }
  77. /*!
  78. \brief Enable MMX check for filter functions and use MMX code if available.
  79. */
  80. void SDL_imageFilterMMXon()
  81. {
  82. SDL_imageFilterUseMMX = 1;
  83. }
  84. /* ------------------------------------------------------------------------------------ */
  85. /*!
  86. \brief Internal MMX Filter using Add: D = saturation255(S1 + S2)
  87. \param Src1 Pointer to the start of the first source byte array (S1).
  88. \param Src2 Pointer to the start of the second source byte array (S2).
  89. \param Dest Pointer to the start of the destination byte array (D).
  90. \param SrcLength The number of bytes in the source arrays.
  91. \return Returns 0 for success or -1 for error.
  92. */
  93. int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  94. {
  95. #ifdef USE_MMX
  96. #if !defined(GCC__)
  97. __asm
  98. {
  99. pusha
  100. mov eax, Src1 /* load Src1 address into eax */
  101. mov ebx, Src2 /* load Src2 address into ebx */
  102. mov edi, Dest /* load Dest address into edi */
  103. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  104. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  105. align 16 /* 16 byte alignment of the loop entry */
  106. L1010:
  107. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  108. paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */
  109. movq [edi], mm1 /* store result in Dest */
  110. add eax, 8 /* increase Src1, Src2 and Dest */
  111. add ebx, 8 /* register pointers by 8 */
  112. add edi, 8
  113. dec ecx /* decrease loop counter */
  114. jnz L1010 /* check loop termination, proceed if required */
  115. emms /* exit MMX state */
  116. popa
  117. }
  118. #else
  119. asm volatile
  120. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  121. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  122. "mov %0, %%edi \n\t" /* load Dest address into edi */
  123. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  124. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  125. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  126. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  127. "paddusb (%%ebx), %%mm1 \n\t" /* mm1=Src1+Src2 (add 8 bytes with saturation) */
  128. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  129. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  130. "add $8, %%ebx \n\t" /* register pointers by 8 */
  131. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  132. "jnz 1b \n\t" /* check loop termination, proceed if required */
  133. "emms \n\t" /* exit MMX state */
  134. "popa \n\t":"=m" (Dest) /* %0 */
  135. :"m"(Src2), /* %1 */
  136. "m"(Src1), /* %2 */
  137. "m"(SrcLength) /* %3 */
  138. );
  139. #endif
  140. return (0);
  141. #else
  142. return (-1);
  143. #endif
  144. }
  145. /*!
  146. \brief Filter using Add: D = saturation255(S1 + S2)
  147. \param Src1 Pointer to the start of the first source byte array (S1).
  148. \param Src2 Pointer to the start of the second source byte array (S2).
  149. \param Dest Pointer to the start of the destination byte array (D).
  150. \param length The number of bytes in the source arrays.
  151. \return Returns 0 for success or -1 for error.
  152. */
  153. int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  154. {
  155. unsigned int i, istart;
  156. unsigned char *cursrc1, *cursrc2, *curdst;
  157. int result;
  158. /* Validate input parameters */
  159. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  160. return(-1);
  161. if (length == 0)
  162. return(0);
  163. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  164. /* Use MMX assembly routine */
  165. SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
  166. /* Check for unaligned bytes */
  167. if ((length & 7) > 0) {
  168. /* Setup to process unaligned bytes */
  169. istart = length & 0xfffffff8;
  170. cursrc1 = &Src1[istart];
  171. cursrc2 = &Src2[istart];
  172. curdst = &Dest[istart];
  173. } else {
  174. /* No unaligned bytes - we are done */
  175. return (0);
  176. }
  177. } else {
  178. /* Setup to process whole image */
  179. istart = 0;
  180. cursrc1 = Src1;
  181. cursrc2 = Src2;
  182. curdst = Dest;
  183. }
  184. /* C routine to process image */
  185. for (i = istart; i < length; i++) {
  186. result = (int) *cursrc1 + (int) *cursrc2;
  187. if (result > 255)
  188. result = 255;
  189. *curdst = (unsigned char) result;
  190. /* Advance pointers */
  191. cursrc1++;
  192. cursrc2++;
  193. curdst++;
  194. }
  195. return (0);
  196. }
  197. /*!
  198. \brief Internal MMX Filter using Mean: D = S1/2 + S2/2
  199. \param Src1 Pointer to the start of the first source byte array (S1).
  200. \param Src2 Pointer to the start of the second source byte array (S2).
  201. \param Dest Pointer to the start of the destination byte array (D).
  202. \param SrcLength The number of bytes in the source arrays.
  203. \param Mask Mask array containing 8 bytes with 0x7F value.
  204. ]
  205. \return Returns 0 for success or -1 for error.
  206. */
  207. int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
  208. unsigned char *Mask)
  209. {
  210. #ifdef USE_MMX
  211. #if !defined(GCC__)
  212. __asm
  213. {
  214. pusha
  215. mov edx, Mask /* load Mask address into edx */
  216. movq mm0, [edx] /* load Mask into mm0 */
  217. mov eax, Src1 /* load Src1 address into eax */
  218. mov ebx, Src2 /* load Src2 address into ebx */
  219. mov edi, Dest /* load Dest address into edi */
  220. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  221. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  222. align 16 /* 16 byte alignment of the loop entry */
  223. L21011:
  224. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  225. movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
  226. /* --- Byte shift via Word shift --- */
  227. psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */
  228. psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */
  229. pand mm1, mm0 // apply Mask to 8 BYTES of mm1 */
  230. /* byte 0x0f, 0xdb, 0xc8 */
  231. pand mm2, mm0 // apply Mask to 8 BYTES of mm2 */
  232. /* byte 0x0f, 0xdb, 0xd0 */
  233. paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */
  234. movq [edi], mm1 /* store result in Dest */
  235. add eax, 8 /* increase Src1, Src2 and Dest */
  236. add ebx, 8 /* register pointers by 8 */
  237. add edi, 8
  238. dec ecx /* decrease loop counter */
  239. jnz L21011 /* check loop termination, proceed if required */
  240. emms /* exit MMX state */
  241. popa
  242. }
  243. #else
  244. asm volatile
  245. ("pusha \n\t" "movl %4, %%edx \n\t" /* load Mask address into edx */
  246. "movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */
  247. "mov %2, %%eax \n\t" /* load Src1 address into eax */
  248. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  249. "mov %0, %%edi \n\t" /* load Dest address into edi */
  250. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  251. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  252. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  253. "1: \n\t"
  254. "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  255. "movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */
  256. /* --- Byte shift via Word shift --- */
  257. "psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of mm1 1 bit to the right */
  258. "psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of mm2 1 bit to the right */
  259. /* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of mm1 */
  260. ".byte 0x0f, 0xdb, 0xc8 \n\t"
  261. /* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of mm2 */
  262. ".byte 0x0f, 0xdb, 0xd0 \n\t"
  263. "paddusb %%mm2, %%mm1 \n\t" /* mm1=mm1+mm2 (add 8 bytes with saturation) */
  264. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  265. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  266. "add $8, %%ebx \n\t" /* register pointers by 8 */
  267. "add $8, %%edi \n\t"
  268. "dec %%ecx \n\t" /* decrease loop counter */
  269. "jnz 1b \n\t" /* check loop termination, proceed if required */
  270. "emms \n\t" /* exit MMX state */
  271. "popa \n\t":"=m" (Dest) /* %0 */
  272. :"m"(Src2), /* %1 */
  273. "m"(Src1), /* %2 */
  274. "m"(SrcLength), /* %3 */
  275. "m"(Mask) /* %4 */
  276. );
  277. #endif
  278. return (0);
  279. #else
  280. return (-1);
  281. #endif
  282. }
  283. /*!
  284. \brief Filter using Mean: D = S1/2 + S2/2
  285. \param Src1 Pointer to the start of the first source byte array (S1).
  286. \param Src2 Pointer to the start of the second source byte array (S2).
  287. \param Dest Pointer to the start of the destination byte array (D).
  288. \param length The number of bytes in the source arrays.
  289. \return Returns 0 for success or -1 for error.
  290. */
  291. int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  292. {
  293. static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
  294. unsigned int i, istart;
  295. unsigned char *cursrc1, *cursrc2, *curdst;
  296. int result;
  297. /* Validate input parameters */
  298. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  299. return(-1);
  300. if (length == 0)
  301. return(0);
  302. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  303. /* MMX routine */
  304. SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
  305. /* Check for unaligned bytes */
  306. if ((length & 7) > 0) {
  307. /* Setup to process unaligned bytes */
  308. istart = length & 0xfffffff8;
  309. cursrc1 = &Src1[istart];
  310. cursrc2 = &Src2[istart];
  311. curdst = &Dest[istart];
  312. } else {
  313. /* No unaligned bytes - we are done */
  314. return (0);
  315. }
  316. } else {
  317. /* Setup to process whole image */
  318. istart = 0;
  319. cursrc1 = Src1;
  320. cursrc2 = Src2;
  321. curdst = Dest;
  322. }
  323. /* C routine to process image */
  324. for (i = istart; i < length; i++) {
  325. result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
  326. *curdst = (unsigned char) result;
  327. /* Advance pointers */
  328. cursrc1++;
  329. cursrc2++;
  330. curdst++;
  331. }
  332. return (0);
  333. }
  334. /*!
  335. \brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)
  336. \param Src1 Pointer to the start of the first source byte array (S1).
  337. \param Src2 Pointer to the start of the second source byte array (S2).
  338. \param Dest Pointer to the start of the destination byte array (D).
  339. \param SrcLength The number of bytes in the source arrays.
  340. \return Returns 0 for success or -1 for error.
  341. */
  342. int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  343. {
  344. #ifdef USE_MMX
  345. #if !defined(GCC__)
  346. __asm
  347. {
  348. pusha
  349. mov eax, Src1 /* load Src1 address into eax */
  350. mov ebx, Src2 /* load Src2 address into ebx */
  351. mov edi, Dest /* load Dest address into edi */
  352. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  353. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  354. align 16 /* 16 byte alignment of the loop entry */
  355. L1012:
  356. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  357. psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
  358. movq [edi], mm1 /* store result in Dest */
  359. add eax, 8 /* increase Src1, Src2 and Dest */
  360. add ebx, 8 /* register pointers by 8 */
  361. add edi, 8
  362. dec ecx /* decrease loop counter */
  363. jnz L1012 /* check loop termination, proceed if required */
  364. emms /* exit MMX state */
  365. popa
  366. }
  367. #else
  368. asm volatile
  369. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  370. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  371. "mov %0, %%edi \n\t" /* load Dest address into edi */
  372. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  373. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  374. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  375. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  376. "psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
  377. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  378. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  379. "add $8, %%ebx \n\t" /* register pointers by 8 */
  380. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  381. "jnz 1b \n\t" /* check loop termination, proceed if required */
  382. "emms \n\t" /* exit MMX state */
  383. "popa \n\t":"=m" (Dest) /* %0 */
  384. :"m"(Src2), /* %1 */
  385. "m"(Src1), /* %2 */
  386. "m"(SrcLength) /* %3 */
  387. );
  388. #endif
  389. return (0);
  390. #else
  391. return (-1);
  392. #endif
  393. }
  394. /*!
  395. \brief Filter using Sub: D = saturation0(S1 - S2)
  396. \param Src1 Pointer to the start of the first source byte array (S1).
  397. \param Src2 Pointer to the start of the second source byte array (S2).
  398. \param Dest Pointer to the start of the destination byte array (D).
  399. \param length The number of bytes in the source arrays.
  400. \return Returns 0 for success or -1 for error.
  401. */
  402. int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  403. {
  404. unsigned int i, istart;
  405. unsigned char *cursrc1, *cursrc2, *curdst;
  406. int result;
  407. /* Validate input parameters */
  408. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  409. return(-1);
  410. if (length == 0)
  411. return(0);
  412. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  413. /* MMX routine */
  414. SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
  415. /* Check for unaligned bytes */
  416. if ((length & 7) > 0) {
  417. /* Setup to process unaligned bytes */
  418. istart = length & 0xfffffff8;
  419. cursrc1 = &Src1[istart];
  420. cursrc2 = &Src2[istart];
  421. curdst = &Dest[istart];
  422. } else {
  423. /* No unaligned bytes - we are done */
  424. return (0);
  425. }
  426. } else {
  427. /* Setup to process whole image */
  428. istart = 0;
  429. cursrc1 = Src1;
  430. cursrc2 = Src2;
  431. curdst = Dest;
  432. }
  433. /* C routine to process image */
  434. for (i = istart; i < length; i++) {
  435. result = (int) *cursrc1 - (int) *cursrc2;
  436. if (result < 0)
  437. result = 0;
  438. *curdst = (unsigned char) result;
  439. /* Advance pointers */
  440. cursrc1++;
  441. cursrc2++;
  442. curdst++;
  443. }
  444. return (0);
  445. }
  446. /*!
  447. \brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |
  448. \param Src1 Pointer to the start of the first source byte array (S1).
  449. \param Src2 Pointer to the start of the second source byte array (S2).
  450. \param Dest Pointer to the start of the destination byte array (D).
  451. \param SrcLength The number of bytes in the source arrays.
  452. \return Returns 0 for success or -1 for error.
  453. */
  454. int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  455. {
  456. #ifdef USE_MMX
  457. #if !defined(GCC__)
  458. __asm
  459. {
  460. pusha
  461. mov eax, Src1 /* load Src1 address into eax */
  462. mov ebx, Src2 /* load Src2 address into ebx */
  463. mov edi, Dest /* load Dest address into edi */
  464. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  465. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  466. align 16 /* 16 byte alignment of the loop entry */
  467. L1013:
  468. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  469. movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
  470. psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
  471. psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
  472. por mm1, mm2 /* combine both mm2 and mm1 results */
  473. movq [edi], mm1 /* store result in Dest */
  474. add eax, 8 /* increase Src1, Src2 and Dest */
  475. add ebx, 8 /* register pointers by 8 */
  476. add edi, 8
  477. dec ecx /* decrease loop counter */
  478. jnz L1013 /* check loop termination, proceed if required */
  479. emms /* exit MMX state */
  480. popa
  481. }
  482. #else
  483. asm volatile
  484. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  485. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  486. "mov %0, %%edi \n\t" /* load Dest address into edi */
  487. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  488. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  489. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  490. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  491. "movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */
  492. "psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
  493. "psubusb (%%eax), %%mm2 \n\t" /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
  494. "por %%mm2, %%mm1 \n\t" /* combine both mm2 and mm1 results */
  495. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  496. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  497. "add $8, %%ebx \n\t" /* register pointers by 8 */
  498. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  499. "jnz 1b \n\t" /* check loop termination, proceed if required */
  500. "emms \n\t" /* exit MMX state */
  501. "popa \n\t":"=m" (Dest) /* %0 */
  502. :"m"(Src2), /* %1 */
  503. "m"(Src1), /* %2 */
  504. "m"(SrcLength) /* %3 */
  505. );
  506. #endif
  507. return (0);
  508. #else
  509. return (-1);
  510. #endif
  511. }
  512. /*!
  513. \brief Filter using AbsDiff: D = | S1 - S2 |
  514. \param Src1 Pointer to the start of the first source byte array (S1).
  515. \param Src2 Pointer to the start of the second source byte array (S2).
  516. \param Dest Pointer to the start of the destination byte array (D).
  517. \param length The number of bytes in the source arrays.
  518. \return Returns 0 for success or -1 for error.
  519. */
  520. int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  521. {
  522. unsigned int i, istart;
  523. unsigned char *cursrc1, *cursrc2, *curdst;
  524. int result;
  525. /* Validate input parameters */
  526. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  527. return(-1);
  528. if (length == 0)
  529. return(0);
  530. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  531. /* MMX routine */
  532. SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
  533. /* Check for unaligned bytes */
  534. if ((length & 7) > 0) {
  535. /* Setup to process unaligned bytes */
  536. istart = length & 0xfffffff8;
  537. cursrc1 = &Src1[istart];
  538. cursrc2 = &Src2[istart];
  539. curdst = &Dest[istart];
  540. } else {
  541. /* No unaligned bytes - we are done */
  542. return (0);
  543. }
  544. } else {
  545. /* Setup to process whole image */
  546. istart = 0;
  547. cursrc1 = Src1;
  548. cursrc2 = Src2;
  549. curdst = Dest;
  550. }
  551. /* C routine to process image */
  552. for (i = istart; i < length; i++) {
  553. result = abs((int) *cursrc1 - (int) *cursrc2);
  554. *curdst = (unsigned char) result;
  555. /* Advance pointers */
  556. cursrc1++;
  557. cursrc2++;
  558. curdst++;
  559. }
  560. return (0);
  561. }
  562. /*!
  563. \brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)
  564. \param Src1 Pointer to the start of the first source byte array (S1).
  565. \param Src2 Pointer to the start of the second source byte array (S2).
  566. \param Dest Pointer to the start of the destination byte array (D).
  567. \param SrcLength The number of bytes in the source arrays.
  568. \return Returns 0 for success or -1 for error.
  569. */
  570. int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  571. {
  572. #ifdef USE_MMX
  573. #if !defined(GCC__)
  574. __asm
  575. {
  576. pusha
  577. mov eax, Src1 /* load Src1 address into eax */
  578. mov ebx, Src2 /* load Src2 address into ebx */
  579. mov edi, Dest /* load Dest address into edi */
  580. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  581. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  582. pxor mm0, mm0 /* zero mm0 register */
  583. align 16 /* 16 byte alignment of the loop entry */
  584. L1014:
  585. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  586. movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
  587. movq mm2, mm1 /* copy mm1 into mm2 */
  588. movq mm4, mm3 /* copy mm3 into mm4 */
  589. punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
  590. punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
  591. punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
  592. punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
  593. pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
  594. pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
  595. /* Take abs value of the results (signed words) */
  596. movq mm5, mm1 /* copy mm1 into mm5 */
  597. movq mm6, mm2 /* copy mm2 into mm6 */
  598. psraw mm5, 15 /* fill mm5 words with word sign bit */
  599. psraw mm6, 15 /* fill mm6 words with word sign bit */
  600. pxor mm1, mm5 /* take 1's compliment of only neg. words */
  601. pxor mm2, mm6 /* take 1's compliment of only neg. words */
  602. psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */
  603. psubsw mm2, mm6 /* add 1 to only neg. words, W-(-1) or W-0 */
  604. packuswb mm1, mm2 /* pack words back into bytes with saturation */
  605. movq [edi], mm1 /* store result in Dest */
  606. add eax, 8 /* increase Src1, Src2 and Dest */
  607. add ebx, 8 /* register pointers by 8 */
  608. add edi, 8
  609. dec ecx /* decrease loop counter */
  610. jnz L1014 /* check loop termination, proceed if required */
  611. emms /* exit MMX state */
  612. popa
  613. }
  614. #else
  615. asm volatile
  616. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  617. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  618. "mov %0, %%edi \n\t" /* load Dest address into edi */
  619. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  620. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  621. "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
  622. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  623. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  624. "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
  625. "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
  626. "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
  627. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */
  628. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */
  629. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */
  630. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */
  631. "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */
  632. "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */
  633. /* Take abs value of the results (signed words) */
  634. "movq %%mm1, %%mm5 \n\t" /* copy mm1 into mm5 */
  635. "movq %%mm2, %%mm6 \n\t" /* copy mm2 into mm6 */
  636. "psraw $15, %%mm5 \n\t" /* fill mm5 words with word sign bit */
  637. "psraw $15, %%mm6 \n\t" /* fill mm6 words with word sign bit */
  638. "pxor %%mm5, %%mm1 \n\t" /* take 1's compliment of only neg. words */
  639. "pxor %%mm6, %%mm2 \n\t" /* take 1's compliment of only neg. words */
  640. "psubsw %%mm5, %%mm1 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  641. "psubsw %%mm6, %%mm2 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  642. "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */
  643. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  644. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  645. "add $8, %%ebx \n\t" /* register pointers by 8 */
  646. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  647. "jnz 1b \n\t" /* check loop termination, proceed if required */
  648. "emms \n\t" /* exit MMX state */
  649. "popa \n\t":"=m" (Dest) /* %0 */
  650. :"m"(Src2), /* %1 */
  651. "m"(Src1), /* %2 */
  652. "m"(SrcLength) /* %3 */
  653. );
  654. #endif
  655. return (0);
  656. #else
  657. return (-1);
  658. #endif
  659. }
  660. /*!
  661. \brief Filter using Mult: D = saturation255(S1 * S2)
  662. \param Src1 Pointer to the start of the first source byte array (S1).
  663. \param Src2 Pointer to the start of the second source byte array (S2).
  664. \param Dest Pointer to the start of the destination byte array (D).
  665. \param length The number of bytes in the source arrays.
  666. \return Returns 0 for success or -1 for error.
  667. */
  668. int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  669. {
  670. unsigned int i, istart;
  671. unsigned char *cursrc1, *cursrc2, *curdst;
  672. int result;
  673. /* Validate input parameters */
  674. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  675. return(-1);
  676. if (length == 0)
  677. return(0);
  678. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  679. /* MMX routine */
  680. SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
  681. /* Check for unaligned bytes */
  682. if ((length & 7) > 0) {
  683. /* Setup to process unaligned bytes */
  684. istart = length & 0xfffffff8;
  685. cursrc1 = &Src1[istart];
  686. cursrc2 = &Src2[istart];
  687. curdst = &Dest[istart];
  688. } else {
  689. /* No unaligned bytes - we are done */
  690. return (0);
  691. }
  692. } else {
  693. /* Setup to process whole image */
  694. istart = 0;
  695. cursrc1 = Src1;
  696. cursrc2 = Src2;
  697. curdst = Dest;
  698. }
  699. /* C routine to process image */
  700. for (i = istart; i < length; i++) {
  701. /* NOTE: this is probably wrong - dunno what the MMX code does */
  702. result = (int) *cursrc1 * (int) *cursrc2;
  703. if (result > 255)
  704. result = 255;
  705. *curdst = (unsigned char) result;
  706. /* Advance pointers */
  707. cursrc1++;
  708. cursrc2++;
  709. curdst++;
  710. }
  711. return (0);
  712. }
  713. /*!
  714. \brief Internal ASM Filter using MultNor: D = S1 * S2
  715. \param Src1 Pointer to the start of the first source byte array (S1).
  716. \param Src2 Pointer to the start of the second source byte array (S2).
  717. \param Dest Pointer to the start of the destination byte array (D).
  718. \param SrcLength The number of bytes in the source arrays.
  719. \return Returns 0 for success or -1 for error.
  720. */
  721. int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  722. {
  723. #ifdef USE_MMX
  724. #if !defined(GCC__)
  725. __asm
  726. {
  727. pusha
  728. mov edx, Src1 /* load Src1 address into edx */
  729. mov esi, Src2 /* load Src2 address into esi */
  730. mov edi, Dest /* load Dest address into edi */
  731. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  732. align 16 /* 16 byte alignment of the loop entry */
  733. L10141:
  734. mov al, [edx] /* load a byte from Src1 */
  735. mul [esi] /* mul with a byte from Src2 */
  736. mov [edi], al /* move a byte result to Dest */
  737. inc edx /* increment Src1, Src2, Dest */
  738. inc esi /* pointer registers by one */
  739. inc edi
  740. dec ecx /* decrease loop counter */
  741. jnz L10141 /* check loop termination, proceed if required */
  742. popa
  743. }
  744. #else
  745. asm volatile
  746. ("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */
  747. "mov %1, %%esi \n\t" /* load Src2 address into esi */
  748. "mov %0, %%edi \n\t" /* load Dest address into edi */
  749. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  750. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  751. "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */
  752. "mulb (%%esi) \n\t" /* mul with a byte from Src2 */
  753. "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
  754. "inc %%edx \n\t" /* increment Src1, Src2, Dest */
  755. "inc %%esi \n\t" /* pointer registers by one */
  756. "inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  757. "jnz 1b \n\t" /* check loop termination, proceed if required */
  758. "popa \n\t":"=m" (Dest) /* %0 */
  759. :"m"(Src2), /* %1 */
  760. "m"(Src1), /* %2 */
  761. "m"(SrcLength) /* %3 */
  762. );
  763. #endif
  764. return (0);
  765. #else
  766. return (-1);
  767. #endif
  768. }
  769. /*!
  770. \brief Filter using MultNor: D = S1 * S2
  771. \param Src1 Pointer to the start of the first source byte array (S1).
  772. \param Src2 Pointer to the start of the second source byte array (S2).
  773. \param Dest Pointer to the start of the destination byte array (D).
  774. \param length The number of bytes in the source arrays.
  775. \return Returns 0 for success or -1 for error.
  776. */
  777. int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  778. {
  779. unsigned int i, istart;
  780. unsigned char *cursrc1, *cursrc2, *curdst;
  781. int result;
  782. /* Validate input parameters */
  783. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  784. return(-1);
  785. if (length == 0)
  786. return(0);
  787. if (SDL_imageFilterMMXdetect()) {
  788. if (length > 0) {
  789. /* ASM routine */
  790. SDL_imageFilterMultNorASM(Src1, Src2, Dest, length);
  791. /* Check for unaligned bytes */
  792. if ((length & 7) > 0) {
  793. /* Setup to process unaligned bytes */
  794. istart = length & 0xfffffff8;
  795. cursrc1 = &Src1[istart];
  796. cursrc2 = &Src2[istart];
  797. curdst = &Dest[istart];
  798. } else {
  799. /* No unaligned bytes - we are done */
  800. return (0);
  801. }
  802. } else {
  803. /* No bytes - we are done */
  804. return (0);
  805. }
  806. } else {
  807. /* Setup to process whole image */
  808. istart = 0;
  809. cursrc1 = Src1;
  810. cursrc2 = Src2;
  811. curdst = Dest;
  812. }
  813. /* C routine to process image */
  814. for (i = istart; i < length; i++) {
  815. result = (int) *cursrc1 * (int) *cursrc2;
  816. *curdst = (unsigned char) result;
  817. /* Advance pointers */
  818. cursrc1++;
  819. cursrc2++;
  820. curdst++;
  821. }
  822. return (0);
  823. }
  824. /*!
  825. \brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
  826. \param Src1 Pointer to the start of the first source byte array (S1).
  827. \param Src2 Pointer to the start of the second source byte array (S2).
  828. \param Dest Pointer to the start of the destination byte array (D).
  829. \param SrcLength The number of bytes in the source arrays.
  830. \return Returns 0 for success or -1 for error.
  831. */
  832. int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  833. {
  834. #ifdef USE_MMX
  835. #if !defined(GCC__)
  836. __asm
  837. {
  838. pusha
  839. mov eax, Src1 /* load Src1 address into eax */
  840. mov ebx, Src2 /* load Src2 address into ebx */
  841. mov edi, Dest /* load Dest address into edi */
  842. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  843. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  844. pxor mm0, mm0 /* zero mm0 register */
  845. align 16 /* 16 byte alignment of the loop entry */
  846. L1015:
  847. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  848. movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
  849. movq mm2, mm1 /* copy mm1 into mm2 */
  850. movq mm4, mm3 /* copy mm3 into mm4 */
  851. punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
  852. punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
  853. punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
  854. punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
  855. psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
  856. psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
  857. pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
  858. pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
  859. packuswb mm1, mm2 /* pack words back into bytes with saturation */
  860. movq [edi], mm1 /* store result in Dest */
  861. add eax, 8 /* increase Src1, Src2 and Dest */
  862. add ebx, 8 /* register pointers by 8 */
  863. add edi, 8
  864. dec ecx /* decrease loop counter */
  865. jnz L1015 /* check loop termination, proceed if required */
  866. emms /* exit MMX state */
  867. popa
  868. }
  869. #else
  870. asm volatile
  871. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  872. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  873. "mov %0, %%edi \n\t" /* load Dest address into edi */
  874. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  875. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  876. "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
  877. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  878. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  879. "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
  880. "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
  881. "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
  882. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */
  883. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */
  884. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */
  885. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */
  886. "psrlw $1, %%mm1 \n\t" /* divide mm1 words by 2, Src1 low bytes */
  887. "psrlw $1, %%mm2 \n\t" /* divide mm2 words by 2, Src1 high bytes */
  888. "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */
  889. "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */
  890. "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */
  891. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  892. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  893. "add $8, %%ebx \n\t" /* register pointers by 8 */
  894. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  895. "jnz 1b \n\t" /* check loop termination, proceed if required */
  896. "emms \n\t" /* exit MMX state */
  897. "popa \n\t":"=m" (Dest) /* %0 */
  898. :"m"(Src2), /* %1 */
  899. "m"(Src1), /* %2 */
  900. "m"(SrcLength) /* %3 */
  901. );
  902. #endif
  903. return (0);
  904. #else
  905. return (-1);
  906. #endif
  907. }
  908. /*!
  909. \brief Filter using MultDivby2: D = saturation255(S1/2 * S2)
  910. \param Src1 Pointer to the start of the first source byte array (S1).
  911. \param Src2 Pointer to the start of the second source byte array (S2).
  912. \param Dest Pointer to the start of the destination byte array (D).
  913. \param length The number of bytes in the source arrays.
  914. \return Returns 0 for success or -1 for error.
  915. */
  916. int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  917. {
  918. unsigned int i, istart;
  919. unsigned char *cursrc1, *cursrc2, *curdst;
  920. int result;
  921. /* Validate input parameters */
  922. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  923. return(-1);
  924. if (length == 0)
  925. return(0);
  926. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  927. /* MMX routine */
  928. SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
  929. /* Check for unaligned bytes */
  930. if ((length & 7) > 0) {
  931. /* Setup to process unaligned bytes */
  932. istart = length & 0xfffffff8;
  933. cursrc1 = &Src1[istart];
  934. cursrc2 = &Src2[istart];
  935. curdst = &Dest[istart];
  936. } else {
  937. /* No unaligned bytes - we are done */
  938. return (0);
  939. }
  940. } else {
  941. /* Setup to process whole image */
  942. istart = 0;
  943. cursrc1 = Src1;
  944. cursrc2 = Src2;
  945. curdst = Dest;
  946. }
  947. /* C routine to process image */
  948. for (i = istart; i < length; i++) {
  949. result = ((int) *cursrc1 / 2) * (int) *cursrc2;
  950. if (result > 255)
  951. result = 255;
  952. *curdst = (unsigned char) result;
  953. /* Advance pointers */
  954. cursrc1++;
  955. cursrc2++;
  956. curdst++;
  957. }
  958. return (0);
  959. }
  960. /*!
  961. \brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
  962. \param Src1 Pointer to the start of the first source byte array (S1).
  963. \param Src2 Pointer to the start of the second source byte array (S2).
  964. \param Dest Pointer to the start of the destination byte array (D).
  965. \param SrcLength The number of bytes in the source arrays.
  966. \return Returns 0 for success or -1 for error.
  967. */
  968. int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  969. {
  970. #ifdef USE_MMX
  971. #if !defined(GCC__)
  972. __asm
  973. {
  974. pusha
  975. mov eax, Src1 /* load Src1 address into eax */
  976. mov ebx, Src2 /* load Src2 address into ebx */
  977. mov edi, Dest /* load Dest address into edi */
  978. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  979. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  980. pxor mm0, mm0 /* zero mm0 register */
  981. align 16 /* 16 byte alignment of the loop entry */
  982. L1016:
  983. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  984. movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
  985. movq mm2, mm1 /* copy mm1 into mm2 */
  986. movq mm4, mm3 /* copy mm3 into mm4 */
  987. punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
  988. punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
  989. punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
  990. punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
  991. psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
  992. psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
  993. psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */
  994. psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */
  995. pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
  996. pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
  997. packuswb mm1, mm2 /* pack words back into bytes with saturation */
  998. movq [edi], mm1 /* store result in Dest */
  999. add eax, 8 /* increase Src1, Src2 and Dest */
  1000. add ebx, 8 /* register pointers by 8 */
  1001. add edi, 8
  1002. dec ecx /* decrease loop counter */
  1003. jnz L1016 /* check loop termination, proceed if required */
  1004. emms /* exit MMX state */
  1005. popa
  1006. }
  1007. #else
  1008. asm volatile
  1009. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  1010. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  1011. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1012. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1013. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1014. "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
  1015. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1016. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  1017. "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
  1018. "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
  1019. "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
  1020. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */
  1021. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */
  1022. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */
  1023. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */
  1024. "psrlw $1, %%mm1 \n\t" /* divide mm1 words by 2, Src1 low bytes */
  1025. "psrlw $1, %%mm2 \n\t" /* divide mm2 words by 2, Src1 high bytes */
  1026. "psrlw $1, %%mm3 \n\t" /* divide mm3 words by 2, Src2 low bytes */
  1027. "psrlw $1, %%mm4 \n\t" /* divide mm4 words by 2, Src2 high bytes */
  1028. "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */
  1029. "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */
  1030. "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */
  1031. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  1032. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1033. "add $8, %%ebx \n\t" /* register pointers by 8 */
  1034. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1035. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1036. "emms \n\t" /* exit MMX state */
  1037. "popa \n\t":"=m" (Dest) /* %0 */
  1038. :"m"(Src2), /* %1 */
  1039. "m"(Src1), /* %2 */
  1040. "m"(SrcLength) /* %3 */
  1041. );
  1042. #endif
  1043. return (0);
  1044. #else
  1045. return (-1);
  1046. #endif
  1047. }
  1048. /*!
  1049. \brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
  1050. \param Src1 Pointer to the start of the first source byte array (S1).
  1051. \param Src2 Pointer to the start of the second source byte array (S2).
  1052. \param Dest Pointer to the start of the destination byte array (D).
  1053. \param length The number of bytes in the source arrays.
  1054. \return Returns 0 for success or -1 for error.
  1055. */
  1056. int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1057. {
  1058. unsigned int i, istart;
  1059. unsigned char *cursrc1, *cursrc2, *curdst;
  1060. int result;
  1061. /* Validate input parameters */
  1062. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1063. return(-1);
  1064. if (length == 0)
  1065. return(0);
  1066. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1067. /* MMX routine */
  1068. SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);
  1069. /* Check for unaligned bytes */
  1070. if ((length & 7) > 0) {
  1071. /* Setup to process unaligned bytes */
  1072. istart = length & 0xfffffff8;
  1073. cursrc1 = &Src1[istart];
  1074. cursrc2 = &Src2[istart];
  1075. curdst = &Dest[istart];
  1076. } else {
  1077. /* No unaligned bytes - we are done */
  1078. return (0);
  1079. }
  1080. } else {
  1081. /* Setup to process whole image */
  1082. istart = 0;
  1083. cursrc1 = Src1;
  1084. cursrc2 = Src2;
  1085. curdst = Dest;
  1086. }
  1087. /* C routine to process image */
  1088. for (i = istart; i < length; i++) {
  1089. result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
  1090. if (result > 255)
  1091. result = 255;
  1092. *curdst = (unsigned char) result;
  1093. /* Advance pointers */
  1094. cursrc1++;
  1095. cursrc2++;
  1096. curdst++;
  1097. }
  1098. return (0);
  1099. }
  1100. /*!
  1101. \brief Internal MMX Filter using BitAnd: D = S1 & S2
  1102. \param Src1 Pointer to the start of the first source byte array (S1).
  1103. \param Src2 Pointer to the start of the second source byte array (S2).
  1104. \param Dest Pointer to the start of the destination byte array (D).
  1105. \param SrcLength The number of bytes in the source arrays.
  1106. \return Returns 0 for success or -1 for error.
  1107. */
  1108. int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  1109. {
  1110. #ifdef USE_MMX
  1111. #if !defined(GCC__)
  1112. __asm
  1113. {
  1114. pusha
  1115. mov eax, Src1 /* load Src1 address into eax */
  1116. mov ebx, Src2 /* load Src2 address into ebx */
  1117. mov edi, Dest /* load Dest address into edi */
  1118. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1119. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1120. align 16 /* 16 byte alignment of the loop entry */
  1121. L1017:
  1122. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  1123. pand mm1, [ebx] /* mm1=Src1&Src2 */
  1124. movq [edi], mm1 /* store result in Dest */
  1125. add eax, 8 /* increase Src1, Src2 and Dest */
  1126. add ebx, 8 /* register pointers by 8 */
  1127. add edi, 8
  1128. dec ecx /* decrease loop counter */
  1129. jnz L1017 /* check loop termination, proceed if required */
  1130. emms /* exit MMX state */
  1131. popa
  1132. }
  1133. #else
  1134. asm volatile
  1135. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  1136. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  1137. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1138. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1139. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1140. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1141. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  1142. "pand (%%ebx), %%mm1 \n\t" /* mm1=Src1&Src2 */
  1143. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  1144. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1145. "add $8, %%ebx \n\t" /* register pointers by 8 */
  1146. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1147. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1148. "emms \n\t" /* exit MMX state */
  1149. "popa \n\t":"=m" (Dest) /* %0 */
  1150. :"m"(Src2), /* %1 */
  1151. "m"(Src1), /* %2 */
  1152. "m"(SrcLength) /* %3 */
  1153. );
  1154. #endif
  1155. return (0);
  1156. #else
  1157. return (-1);
  1158. #endif
  1159. }
  1160. /*!
  1161. \brief Filter using BitAnd: D = S1 & S2
  1162. \param Src1 Pointer to the start of the first source byte array (S1).
  1163. \param Src2 Pointer to the start of the second source byte array (S2).
  1164. \param Dest Pointer to the start of the destination byte array (D).
  1165. \param length The number of bytes in the source arrays.
  1166. \return Returns 0 for success or -1 for error.
  1167. */
  1168. int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1169. {
  1170. unsigned int i, istart;
  1171. unsigned char *cursrc1, *cursrc2, *curdst;
  1172. /* Validate input parameters */
  1173. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1174. return(-1);
  1175. if (length == 0)
  1176. return(0);
  1177. if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
  1178. /* if (length > 7) { */
  1179. /* Call MMX routine */
  1180. SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
  1181. /* Check for unaligned bytes */
  1182. if ((length & 7) > 0) {
  1183. /* Setup to process unaligned bytes */
  1184. istart = length & 0xfffffff8;
  1185. cursrc1 = &Src1[istart];
  1186. cursrc2 = &Src2[istart];
  1187. curdst = &Dest[istart];
  1188. } else {
  1189. /* No unaligned bytes - we are done */
  1190. return (0);
  1191. }
  1192. } else {
  1193. /* Setup to process whole image */
  1194. istart = 0;
  1195. cursrc1 = Src1;
  1196. cursrc2 = Src2;
  1197. curdst = Dest;
  1198. }
  1199. /* C routine to process image */
  1200. for (i = istart; i < length; i++) {
  1201. *curdst = (*cursrc1) & (*cursrc2);
  1202. /* Advance pointers */
  1203. cursrc1++;
  1204. cursrc2++;
  1205. curdst++;
  1206. }
  1207. return (0);
  1208. }
  1209. /*!
  1210. \brief Internal MMX Filter using BitOr: D = S1 | S2
  1211. \param Src1 Pointer to the start of the first source byte array (S1).
  1212. \param Src2 Pointer to the start of the second source byte array (S2).
  1213. \param Dest Pointer to the start of the destination byte array (D).
  1214. \param SrcLength The number of bytes in the source arrays.
  1215. \return Returns 0 for success or -1 for error.
  1216. */
  1217. int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  1218. {
  1219. #ifdef USE_MMX
  1220. #if !defined(GCC__)
  1221. __asm
  1222. {
  1223. pusha
  1224. mov eax, Src1 /* load Src1 address into eax */
  1225. mov ebx, Src2 /* load Src2 address into ebx */
  1226. mov edi, Dest /* load Dest address into edi */
  1227. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1228. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1229. align 16 /* 16 byte alignment of the loop entry */
  1230. L91017:
  1231. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  1232. por mm1, [ebx] /* mm1=Src1|Src2 */
  1233. movq [edi], mm1 /* store result in Dest */
  1234. add eax, 8 /* increase Src1, Src2 and Dest */
  1235. add ebx, 8 /* register pointers by 8 */
  1236. add edi, 8
  1237. dec ecx /* decrease loop counter */
  1238. jnz L91017 /* check loop termination, proceed if required */
  1239. emms /* exit MMX state */
  1240. popa
  1241. }
  1242. #else
  1243. asm volatile
  1244. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  1245. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  1246. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1247. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1248. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1249. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1250. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  1251. "por (%%ebx), %%mm1 \n\t" /* mm1=Src1|Src2 */
  1252. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  1253. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1254. "add $8, %%ebx \n\t" /* register pointers by 8 */
  1255. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1256. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1257. "emms \n\t" /* exit MMX state */
  1258. "popa \n\t":"=m" (Dest) /* %0 */
  1259. :"m"(Src2), /* %1 */
  1260. "m"(Src1), /* %2 */
  1261. "m"(SrcLength) /* %3 */
  1262. );
  1263. #endif
  1264. return (0);
  1265. #else
  1266. return (-1);
  1267. #endif
  1268. }
  1269. /*!
  1270. \brief Filter using BitOr: D = S1 | S2
  1271. \param Src1 Pointer to the start of the first source byte array (S1).
  1272. \param Src2 Pointer to the start of the second source byte array (S2).
  1273. \param Dest Pointer to the start of the destination byte array (D).
  1274. \param length The number of bytes in the source arrays.
  1275. \return Returns 0 for success or -1 for error.
  1276. */
  1277. int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1278. {
  1279. unsigned int i, istart;
  1280. unsigned char *cursrc1, *cursrc2, *curdst;
  1281. /* Validate input parameters */
  1282. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1283. return(-1);
  1284. if (length == 0)
  1285. return(0);
  1286. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1287. /* MMX routine */
  1288. SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
  1289. /* Check for unaligned bytes */
  1290. if ((length & 7) > 0) {
  1291. /* Setup to process unaligned bytes */
  1292. istart = length & 0xfffffff8;
  1293. cursrc1 = &Src1[istart];
  1294. cursrc2 = &Src2[istart];
  1295. curdst = &Dest[istart];
  1296. } else {
  1297. /*…