PageRenderTime 77ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 1ms

/project/jni/sdl_gfx/SDL_imageFilter.c

https://github.com/aichunyu/FFPlayer
C | 7556 lines | 6038 code | 274 blank | 1244 comment | 520 complexity | d43bab1767b6590f1804da8a41766b90 MD5 | raw file
Possible License(s): LGPL-3.0, 0BSD, Apache-2.0, LGPL-2.1, GPL-2.0, CC-BY-SA-3.0, LGPL-2.0, BSD-3-Clause
  1. /*
  2. SDL_imageFilter - bytes-image "filter" routines.
  3. (Uses inline x86 MMX or ASM optimizations if available and enabled.)
  4. LGPL (c) A. Schiffler
  5. Note: Most of the MMX code is based on published routines
  6. by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to
  7. him for his work.
  8. */
  9. #include <stdio.h>
  10. #include <stdlib.h>
  11. #include <string.h>
  12. #include "SDL_imageFilter.h"
  /*!
  \brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.).

  The argument is converted to unsigned int before any shifting, so a negative
  signed value is neither sign-extended by the right shift nor left-shifted
  (which is undefined for negative values). Each byte is masked out explicitly
  before it is moved to its new position.

  The argument is expanded four times and must therefore be free of side effects.

  \param x The 32bit value whose bytes are to be reversed.

  \returns The byte-reversed value, of type unsigned int.
  */
  #define SWAP_32(x) ((((unsigned int)(x) & 0xff000000u) >> 24) | \
                      (((unsigned int)(x) & 0x00ff0000u) >> 8)  | \
                      (((unsigned int)(x) & 0x0000ff00u) << 8)  | \
                      (((unsigned int)(x) & 0x000000ffu) << 24))
  /* ------ Static variables ----- */

  /*!
  \brief File-local switch for the MMX routines: non-zero allows them, zero disables them. Starts out enabled.
  */
  static int SDL_imageFilterUseMMX = 1;

  /* Define GCC__ whenever the compiler identifies itself as GCC-compatible */
  #ifdef __GNUC__
  #define GCC__
  #endif
  /*!
  \brief Internal function returning the CPU flags.

  Executes CPUID with EAX = 1 and hands back the contents of EDX. When USE_MMX
  is not defined no instruction is executed and the result is 0.

  \returns Flags of system CPU.
  */
  unsigned int _cpuFlags()
  {
      int featureBits = 0;

  #ifdef USE_MMX
  #if defined(GCC__)
      /* GCC flavour: AT&T syntax, result written straight to the local variable */
      asm volatile ("pusha \n\t"
                    "mov %1, %%eax \n\t"      /* request feature flag */
                    "cpuid \n\t"              /* get CPU ID flag */
                    "mov %%edx, %0 \n\t"      /* move result to featureBits */
                    "popa \n\t"
                    : "=m" (featureBits)      /* %0 */
                    : "i" (0x00000001)        /* %1 */
          );
  #else
      /* Non-GCC flavour: Intel syntax inline assembler block */
      __asm
      {
          pusha
          mov eax, 1
          cpuid                     /* get CPU ID flag */
          mov featureBits, edx      /* move result to featureBits */
          popa
      }
  #endif
  #endif

      return (featureBits);
  }
  54. /*!
  55. \brief MMX detection routine (with override flag).
  56. \returns 1 of MMX was detected, 0 otherwise.
  57. */
  58. int SDL_imageFilterMMXdetect(void)
  59. {
  60. unsigned int mmx_bit;
  61. /* Check override flag */
  62. if (SDL_imageFilterUseMMX == 0) {
  63. return (0);
  64. }
  65. mmx_bit = _cpuFlags();
  66. mmx_bit &= 0x00800000;
  67. mmx_bit = (mmx_bit && 0x00800000);
  68. return (mmx_bit);
  69. }
  70. /*!
  71. \brief Disable MMX check for filter functions and and force to use non-MMX C based code.
  72. */
  73. void SDL_imageFilterMMXoff()
  74. {
  75. SDL_imageFilterUseMMX = 0;
  76. }
  77. /*!
  78. \brief Enable MMX check for filter functions and use MMX code if available.
  79. */
  80. void SDL_imageFilterMMXon()
  81. {
  82. SDL_imageFilterUseMMX = 1;
  83. }
  84. /* ------------------------------------------------------------------------------------ */
  /*!
  \brief Internal MMX Filter using Add: D = saturation255(S1 + S2)

  Works on groups of 8 bytes: exactly (SrcLength / 8) * 8 bytes are written to
  Dest and any remaining 1-7 bytes are left for the caller. When USE_MMX is not
  defined nothing is processed and -1 is returned.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param SrcLength The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error (NULL pointer, or MMX code not compiled in).
  */
  int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  {
      /* Never hand NULL arrays to the assembly loop */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
  #ifdef USE_MMX
      /*
       * The loop below tests its counter only after the first pass (dec/jnz).
       * With SrcLength / 8 == 0 the counter would wrap around and the loop
       * would run far beyond the arrays, so there is nothing to do here.
       */
      if (SrcLength < 8)
          return (0);
  #if !defined(GCC__)
      __asm
      {
          pusha
          mov eax, Src1           /* load Src1 address into eax */
          mov ebx, Src2           /* load Src2 address into ebx */
          mov edi, Dest           /* load Dest address into edi */
          mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
          shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
          align 16                /* 16 byte alignment of the loop entry */
  L1010:
          movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
          paddusb mm1, [ebx]      /* mm1=Src1+Src2 (add 8 bytes with saturation) */
          movq [edi], mm1         /* store result in Dest */
          add eax, 8              /* increase Src1, Src2 and Dest */
          add ebx, 8              /* register pointers by 8 */
          add edi, 8
          dec ecx                 /* decrease loop counter */
          jnz L1010               /* check loop termination, proceed if required */
          emms                    /* exit MMX state */
          popa
      }
  #else
      /*
       * Dest is only read by the assembly (its value is the store address), so
       * it is an input operand; the bytes written through it are covered by
       * the "memory" clobber.
       */
      asm volatile
          ("pusha \n\t"
           "mov %2, %%eax \n\t"           /* load Src1 address into eax */
           "mov %1, %%ebx \n\t"           /* load Src2 address into ebx */
           "mov %0, %%edi \n\t"           /* load Dest address into edi */
           "mov %3, %%ecx \n\t"           /* load loop counter (SIZE) into ecx */
           "shr $3, %%ecx \n\t"           /* counter/8 (MMX loads 8 bytes at a time) */
           ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
           "1: movq (%%eax), %%mm1 \n\t"  /* load 8 bytes from Src1 into mm1 */
           "paddusb (%%ebx), %%mm1 \n\t"  /* mm1=Src1+Src2 (add 8 bytes with saturation) */
           "movq %%mm1, (%%edi) \n\t"     /* store result in Dest */
           "add $8, %%eax \n\t"           /* increase Src1, Src2 and Dest */
           "add $8, %%ebx \n\t"           /* register pointers by 8 */
           "add $8, %%edi \n\t"
           "dec %%ecx \n\t"               /* decrease loop counter */
           "jnz 1b \n\t"                  /* check loop termination, proceed if required */
           "emms \n\t"                    /* exit MMX state */
           "popa \n\t"
           : /* no outputs */
           : "m" (Dest),                  /* %0 */
             "m" (Src2),                  /* %1 */
             "m" (Src1),                  /* %2 */
             "m" (SrcLength)              /* %3 */
           : "memory"
          );
  #endif
      return (0);
  #else
      return (-1);
  #endif
  }
  /*!
  \brief Filter using Add: D = saturation255(S1 + S2)

  If MMX is detected and at least 8 bytes are given, the MMX routine is called
  first and the C loop only handles the bytes from the last multiple of 8
  onwards; otherwise the C loop handles the whole array.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  {
      unsigned int pos = 0;       /* index of the first byte left to the C loop */
      int sum;

      /* Validate input parameters */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
          /* Use MMX assembly routine, then continue after the last full group of 8 */
          SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
          pos = length & 0xfffffff8;
      }

      /* C routine: nothing to do when pos already equals length */
      for (; pos < length; pos++) {
          sum = (int) Src1[pos] + (int) Src2[pos];
          Dest[pos] = (unsigned char) ((sum > 255) ? 255 : sum);
      }

      return (0);
  }
  /*!
  \brief Internal MMX Filter using Mean: D = S1/2 + S2/2

  Works on groups of 8 bytes: exactly (SrcLength / 8) * 8 bytes are written to
  Dest and any remaining 1-7 bytes are left for the caller. The halving is done
  with a 16 bit word shift, after which Mask is applied to every byte. When
  USE_MMX is not defined nothing is processed and -1 is returned.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param SrcLength The number of bytes in the source arrays.
  \param Mask Mask array containing 8 bytes with 0x7F value.

  \return Returns 0 for success or -1 for error (NULL pointer, or MMX code not compiled in).
  */
  int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
                             unsigned char *Mask)
  {
      /* Never hand NULL arrays to the assembly loop */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL) || (Mask == NULL))
          return (-1);
  #ifdef USE_MMX
      /*
       * The loop below tests its counter only after the first pass (dec/jnz).
       * With SrcLength / 8 == 0 the counter would wrap around and the loop
       * would run far beyond the arrays, so there is nothing to do here.
       */
      if (SrcLength < 8)
          return (0);
  #if !defined(GCC__)
      __asm
      {
          pusha
          mov edx, Mask           /* load Mask address into edx */
          movq mm0, [edx]         /* load Mask into mm0 */
          mov eax, Src1           /* load Src1 address into eax */
          mov ebx, Src2           /* load Src2 address into ebx */
          mov edi, Dest           /* load Dest address into edi */
          mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
          shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
          align 16                /* 16 byte alignment of the loop entry */
  L21011:
          movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
          movq mm2, [ebx]         /* load 8 bytes from Src2 into mm2 */
          /* --- Byte shift via Word shift --- */
          psrlw mm1, 1            /* shift 4 WORDS of mm1 1 bit to the right */
          psrlw mm2, 1            /* shift 4 WORDS of mm2 1 bit to the right */
          pand mm1, mm0           /* apply Mask to 8 BYTES of mm1 (byte 0x0f, 0xdb, 0xc8) */
          pand mm2, mm0           /* apply Mask to 8 BYTES of mm2 (byte 0x0f, 0xdb, 0xd0) */
          paddusb mm1, mm2        /* mm1=mm1+mm2 (add 8 bytes with saturation) */
          movq [edi], mm1         /* store result in Dest */
          add eax, 8              /* increase Src1, Src2 and Dest */
          add ebx, 8              /* register pointers by 8 */
          add edi, 8
          dec ecx                 /* decrease loop counter */
          jnz L21011              /* check loop termination, proceed if required */
          emms                    /* exit MMX state */
          popa
      }
  #else
      /*
       * Dest is only read by the assembly (its value is the store address), so
       * it is an input operand; the bytes written through it are covered by
       * the "memory" clobber.
       */
      asm volatile
          ("pusha \n\t"
           "movl %4, %%edx \n\t"          /* load Mask address into edx */
           "movq (%%edx), %%mm0 \n\t"     /* load Mask into mm0 */
           "mov %2, %%eax \n\t"           /* load Src1 address into eax */
           "mov %1, %%ebx \n\t"           /* load Src2 address into ebx */
           "mov %0, %%edi \n\t"           /* load Dest address into edi */
           "mov %3, %%ecx \n\t"           /* load loop counter (SIZE) into ecx */
           "shr $3, %%ecx \n\t"           /* counter/8 (MMX loads 8 bytes at a time) */
           ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
           "1: \n\t"
           "movq (%%eax), %%mm1 \n\t"     /* load 8 bytes from Src1 into mm1 */
           "movq (%%ebx), %%mm2 \n\t"     /* load 8 bytes from Src2 into mm2 */
           /* --- Byte shift via Word shift --- */
           "psrlw $1, %%mm1 \n\t"         /* shift 4 WORDS of mm1 1 bit to the right */
           "psrlw $1, %%mm2 \n\t"         /* shift 4 WORDS of mm2 1 bit to the right */
           ".byte 0x0f, 0xdb, 0xc8 \n\t"  /* hand-encoded "pand %%mm0, %%mm1": apply Mask to 8 BYTES of mm1 */
           ".byte 0x0f, 0xdb, 0xd0 \n\t"  /* hand-encoded "pand %%mm0, %%mm2": apply Mask to 8 BYTES of mm2 */
           "paddusb %%mm2, %%mm1 \n\t"    /* mm1=mm1+mm2 (add 8 bytes with saturation) */
           "movq %%mm1, (%%edi) \n\t"     /* store result in Dest */
           "add $8, %%eax \n\t"           /* increase Src1, Src2 and Dest */
           "add $8, %%ebx \n\t"           /* register pointers by 8 */
           "add $8, %%edi \n\t"
           "dec %%ecx \n\t"               /* decrease loop counter */
           "jnz 1b \n\t"                  /* check loop termination, proceed if required */
           "emms \n\t"                    /* exit MMX state */
           "popa \n\t"
           : /* no outputs */
           : "m" (Dest),                  /* %0 */
             "m" (Src2),                  /* %1 */
             "m" (Src1),                  /* %2 */
             "m" (SrcLength),             /* %3 */
             "m" (Mask)                   /* %4 */
           : "memory"
          );
  #endif
      return (0);
  #else
      return (-1);
  #endif
  }
  /*!
  \brief Filter using Mean: D = S1/2 + S2/2

  If MMX is detected and at least 8 bytes are given, the MMX routine is called
  first and the C loop only handles the bytes from the last multiple of 8
  onwards; otherwise the C loop handles the whole array. Each source byte is
  halved on its own before the two halves are added.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  {
      /* Byte mask handed to the MMX routine */
      static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
      unsigned int pos = 0;       /* index of the first byte left to the C loop */
      int mean;

      /* Validate input parameters */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
          /* MMX routine, then continue after the last full group of 8 */
          SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
          pos = length & 0xfffffff8;
      }

      /* C routine: nothing to do when pos already equals length */
      for (; pos < length; pos++) {
          mean = (int) Src1[pos] / 2 + (int) Src2[pos] / 2;
          Dest[pos] = (unsigned char) mean;
      }

      return (0);
  }
  /*!
  \brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)

  Works on groups of 8 bytes: exactly (SrcLength / 8) * 8 bytes are written to
  Dest and any remaining 1-7 bytes are left for the caller. When USE_MMX is not
  defined nothing is processed and -1 is returned.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param SrcLength The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error (NULL pointer, or MMX code not compiled in).
  */
  int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  {
      /* Never hand NULL arrays to the assembly loop */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
  #ifdef USE_MMX
      /*
       * The loop below tests its counter only after the first pass (dec/jnz).
       * With SrcLength / 8 == 0 the counter would wrap around and the loop
       * would run far beyond the arrays, so there is nothing to do here.
       */
      if (SrcLength < 8)
          return (0);
  #if !defined(GCC__)
      __asm
      {
          pusha
          mov eax, Src1           /* load Src1 address into eax */
          mov ebx, Src2           /* load Src2 address into ebx */
          mov edi, Dest           /* load Dest address into edi */
          mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
          shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
          align 16                /* 16 byte alignment of the loop entry */
  L1012:
          movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
          psubusb mm1, [ebx]      /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
          movq [edi], mm1         /* store result in Dest */
          add eax, 8              /* increase Src1, Src2 and Dest */
          add ebx, 8              /* register pointers by 8 */
          add edi, 8
          dec ecx                 /* decrease loop counter */
          jnz L1012               /* check loop termination, proceed if required */
          emms                    /* exit MMX state */
          popa
      }
  #else
      /*
       * Dest is only read by the assembly (its value is the store address), so
       * it is an input operand; the bytes written through it are covered by
       * the "memory" clobber.
       */
      asm volatile
          ("pusha \n\t"
           "mov %2, %%eax \n\t"           /* load Src1 address into eax */
           "mov %1, %%ebx \n\t"           /* load Src2 address into ebx */
           "mov %0, %%edi \n\t"           /* load Dest address into edi */
           "mov %3, %%ecx \n\t"           /* load loop counter (SIZE) into ecx */
           "shr $3, %%ecx \n\t"           /* counter/8 (MMX loads 8 bytes at a time) */
           ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
           "1: movq (%%eax), %%mm1 \n\t"  /* load 8 bytes from Src1 into mm1 */
           "psubusb (%%ebx), %%mm1 \n\t"  /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
           "movq %%mm1, (%%edi) \n\t"     /* store result in Dest */
           "add $8, %%eax \n\t"           /* increase Src1, Src2 and Dest */
           "add $8, %%ebx \n\t"           /* register pointers by 8 */
           "add $8, %%edi \n\t"
           "dec %%ecx \n\t"               /* decrease loop counter */
           "jnz 1b \n\t"                  /* check loop termination, proceed if required */
           "emms \n\t"                    /* exit MMX state */
           "popa \n\t"
           : /* no outputs */
           : "m" (Dest),                  /* %0 */
             "m" (Src2),                  /* %1 */
             "m" (Src1),                  /* %2 */
             "m" (SrcLength)              /* %3 */
           : "memory"
          );
  #endif
      return (0);
  #else
      return (-1);
  #endif
  }
  /*!
  \brief Filter using Sub: D = saturation0(S1 - S2)

  If MMX is detected and at least 8 bytes are given, the MMX routine is called
  first and the C loop only handles the bytes from the last multiple of 8
  onwards; otherwise the C loop handles the whole array.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  {
      unsigned int pos = 0;       /* index of the first byte left to the C loop */
      int diff;

      /* Validate input parameters */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
          /* MMX routine, then continue after the last full group of 8 */
          SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
          pos = length & 0xfffffff8;
      }

      /* C routine: nothing to do when pos already equals length */
      for (; pos < length; pos++) {
          diff = (int) Src1[pos] - (int) Src2[pos];
          Dest[pos] = (unsigned char) ((diff < 0) ? 0 : diff);
      }

      return (0);
  }
  /*!
  \brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |

  Works on groups of 8 bytes: exactly (SrcLength / 8) * 8 bytes are written to
  Dest and any remaining 1-7 bytes are left for the caller. The absolute value
  is formed by OR-ing the two saturated differences S1-S2 and S2-S1. When
  USE_MMX is not defined nothing is processed and -1 is returned.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param SrcLength The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error (NULL pointer, or MMX code not compiled in).
  */
  int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  {
      /* Never hand NULL arrays to the assembly loop */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
  #ifdef USE_MMX
      /*
       * The loop below tests its counter only after the first pass (dec/jnz).
       * With SrcLength / 8 == 0 the counter would wrap around and the loop
       * would run far beyond the arrays, so there is nothing to do here.
       */
      if (SrcLength < 8)
          return (0);
  #if !defined(GCC__)
      __asm
      {
          pusha
          mov eax, Src1           /* load Src1 address into eax */
          mov ebx, Src2           /* load Src2 address into ebx */
          mov edi, Dest           /* load Dest address into edi */
          mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
          shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
          align 16                /* 16 byte alignment of the loop entry */
  L1013:
          movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
          movq mm2, [ebx]         /* load 8 bytes from Src2 into mm2 */
          psubusb mm1, [ebx]      /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
          psubusb mm2, [eax]      /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
          por mm1, mm2            /* combine both mm2 and mm1 results */
          movq [edi], mm1         /* store result in Dest */
          add eax, 8              /* increase Src1, Src2 and Dest */
          add ebx, 8              /* register pointers by 8 */
          add edi, 8
          dec ecx                 /* decrease loop counter */
          jnz L1013               /* check loop termination, proceed if required */
          emms                    /* exit MMX state */
          popa
      }
  #else
      /*
       * Dest is only read by the assembly (its value is the store address), so
       * it is an input operand; the bytes written through it are covered by
       * the "memory" clobber.
       */
      asm volatile
          ("pusha \n\t"
           "mov %2, %%eax \n\t"           /* load Src1 address into eax */
           "mov %1, %%ebx \n\t"           /* load Src2 address into ebx */
           "mov %0, %%edi \n\t"           /* load Dest address into edi */
           "mov %3, %%ecx \n\t"           /* load loop counter (SIZE) into ecx */
           "shr $3, %%ecx \n\t"           /* counter/8 (MMX loads 8 bytes at a time) */
           ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
           "1: movq (%%eax), %%mm1 \n\t"  /* load 8 bytes from Src1 into mm1 */
           "movq (%%ebx), %%mm2 \n\t"     /* load 8 bytes from Src2 into mm2 */
           "psubusb (%%ebx), %%mm1 \n\t"  /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
           "psubusb (%%eax), %%mm2 \n\t"  /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
           "por %%mm2, %%mm1 \n\t"        /* combine both mm2 and mm1 results */
           "movq %%mm1, (%%edi) \n\t"     /* store result in Dest */
           "add $8, %%eax \n\t"           /* increase Src1, Src2 and Dest */
           "add $8, %%ebx \n\t"           /* register pointers by 8 */
           "add $8, %%edi \n\t"
           "dec %%ecx \n\t"               /* decrease loop counter */
           "jnz 1b \n\t"                  /* check loop termination, proceed if required */
           "emms \n\t"                    /* exit MMX state */
           "popa \n\t"
           : /* no outputs */
           : "m" (Dest),                  /* %0 */
             "m" (Src2),                  /* %1 */
             "m" (Src1),                  /* %2 */
             "m" (SrcLength)              /* %3 */
           : "memory"
          );
  #endif
      return (0);
  #else
      return (-1);
  #endif
  }
  /*!
  \brief Filter using AbsDiff: D = | S1 - S2 |

  If MMX is detected and at least 8 bytes are given, the MMX routine is called
  first and the C loop only handles the bytes from the last multiple of 8
  onwards; otherwise the C loop handles the whole array.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  {
      unsigned int pos = 0;       /* index of the first byte left to the C loop */
      int first, second;

      /* Validate input parameters */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
          /* MMX routine, then continue after the last full group of 8 */
          SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
          pos = length & 0xfffffff8;
      }

      /* C routine: nothing to do when pos already equals length */
      for (; pos < length; pos++) {
          first = (int) Src1[pos];
          second = (int) Src2[pos];
          /* Subtract the smaller from the larger value to get the magnitude */
          Dest[pos] = (unsigned char) ((first >= second) ? (first - second) : (second - first));
      }

      return (0);
  }
  /*!
  \brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)

  Works on groups of 8 bytes: exactly (SrcLength / 8) * 8 bytes are written to
  Dest and any remaining 1-7 bytes are left for the caller. The bytes are
  widened to 16 bit words, multiplied, turned into their absolute value as
  signed words and packed back into bytes with saturation. When USE_MMX is not
  defined nothing is processed and -1 is returned.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param SrcLength The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error (NULL pointer, or MMX code not compiled in).
  */
  int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  {
      /* Never hand NULL arrays to the assembly loop */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
  #ifdef USE_MMX
      /*
       * The loop below tests its counter only after the first pass (dec/jnz).
       * With SrcLength / 8 == 0 the counter would wrap around and the loop
       * would run far beyond the arrays, so there is nothing to do here.
       */
      if (SrcLength < 8)
          return (0);
  #if !defined(GCC__)
      __asm
      {
          pusha
          mov eax, Src1           /* load Src1 address into eax */
          mov ebx, Src2           /* load Src2 address into ebx */
          mov edi, Dest           /* load Dest address into edi */
          mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
          shr ecx, 3              /* counter/8 (MMX loads 8 bytes at a time) */
          pxor mm0, mm0           /* zero mm0 register */
          align 16                /* 16 byte alignment of the loop entry */
  L1014:
          movq mm1, [eax]         /* load 8 bytes from Src1 into mm1 */
          movq mm3, [ebx]         /* load 8 bytes from Src2 into mm3 */
          movq mm2, mm1           /* copy mm1 into mm2 */
          movq mm4, mm3           /* copy mm3 into mm4 */
          punpcklbw mm1, mm0      /* unpack low bytes of Src1 into words */
          punpckhbw mm2, mm0      /* unpack high bytes of Src1 into words */
          punpcklbw mm3, mm0      /* unpack low bytes of Src2 into words */
          punpckhbw mm4, mm0      /* unpack high bytes of Src2 into words */
          pmullw mm1, mm3         /* mul low bytes of Src1 and Src2 */
          pmullw mm2, mm4         /* mul high bytes of Src1 and Src2 */
          /* Take abs value of the results (signed words) */
          movq mm5, mm1           /* copy mm1 into mm5 */
          movq mm6, mm2           /* copy mm2 into mm6 */
          psraw mm5, 15           /* fill mm5 words with word sign bit */
          psraw mm6, 15           /* fill mm6 words with word sign bit */
          pxor mm1, mm5           /* take 1's complement of only neg. words */
          pxor mm2, mm6           /* take 1's complement of only neg. words */
          psubsw mm1, mm5         /* add 1 to only neg. words, W-(-1) or W-0 */
          psubsw mm2, mm6         /* add 1 to only neg. words, W-(-1) or W-0 */
          packuswb mm1, mm2       /* pack words back into bytes with saturation */
          movq [edi], mm1         /* store result in Dest */
          add eax, 8              /* increase Src1, Src2 and Dest */
          add ebx, 8              /* register pointers by 8 */
          add edi, 8
          dec ecx                 /* decrease loop counter */
          jnz L1014               /* check loop termination, proceed if required */
          emms                    /* exit MMX state */
          popa
      }
  #else
      /*
       * Dest is only read by the assembly (its value is the store address), so
       * it is an input operand; the bytes written through it are covered by
       * the "memory" clobber.
       */
      asm volatile
          ("pusha \n\t"
           "mov %2, %%eax \n\t"           /* load Src1 address into eax */
           "mov %1, %%ebx \n\t"           /* load Src2 address into ebx */
           "mov %0, %%edi \n\t"           /* load Dest address into edi */
           "mov %3, %%ecx \n\t"           /* load loop counter (SIZE) into ecx */
           "shr $3, %%ecx \n\t"           /* counter/8 (MMX loads 8 bytes at a time) */
           "pxor %%mm0, %%mm0 \n\t"       /* zero mm0 register */
           ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
           "1: movq (%%eax), %%mm1 \n\t"  /* load 8 bytes from Src1 into mm1 */
           "movq (%%ebx), %%mm3 \n\t"     /* load 8 bytes from Src2 into mm3 */
           "movq %%mm1, %%mm2 \n\t"       /* copy mm1 into mm2 */
           "movq %%mm3, %%mm4 \n\t"       /* copy mm3 into mm4 */
           "punpcklbw %%mm0, %%mm1 \n\t"  /* unpack low bytes of Src1 into words */
           "punpckhbw %%mm0, %%mm2 \n\t"  /* unpack high bytes of Src1 into words */
           "punpcklbw %%mm0, %%mm3 \n\t"  /* unpack low bytes of Src2 into words */
           "punpckhbw %%mm0, %%mm4 \n\t"  /* unpack high bytes of Src2 into words */
           "pmullw %%mm3, %%mm1 \n\t"     /* mul low bytes of Src1 and Src2 */
           "pmullw %%mm4, %%mm2 \n\t"     /* mul high bytes of Src1 and Src2 */
           /* Take abs value of the results (signed words) */
           "movq %%mm1, %%mm5 \n\t"       /* copy mm1 into mm5 */
           "movq %%mm2, %%mm6 \n\t"       /* copy mm2 into mm6 */
           "psraw $15, %%mm5 \n\t"        /* fill mm5 words with word sign bit */
           "psraw $15, %%mm6 \n\t"        /* fill mm6 words with word sign bit */
           "pxor %%mm5, %%mm1 \n\t"       /* take 1's complement of only neg. words */
           "pxor %%mm6, %%mm2 \n\t"       /* take 1's complement of only neg. words */
           "psubsw %%mm5, %%mm1 \n\t"     /* add 1 to only neg. words, W-(-1) or W-0 */
           "psubsw %%mm6, %%mm2 \n\t"     /* add 1 to only neg. words, W-(-1) or W-0 */
           "packuswb %%mm2, %%mm1 \n\t"   /* pack words back into bytes with saturation */
           "movq %%mm1, (%%edi) \n\t"     /* store result in Dest */
           "add $8, %%eax \n\t"           /* increase Src1, Src2 and Dest */
           "add $8, %%ebx \n\t"           /* register pointers by 8 */
           "add $8, %%edi \n\t"
           "dec %%ecx \n\t"               /* decrease loop counter */
           "jnz 1b \n\t"                  /* check loop termination, proceed if required */
           "emms \n\t"                    /* exit MMX state */
           "popa \n\t"
           : /* no outputs */
           : "m" (Dest),                  /* %0 */
             "m" (Src2),                  /* %1 */
             "m" (Src1),                  /* %2 */
             "m" (SrcLength)              /* %3 */
           : "memory"
          );
  #endif
      return (0);
  #else
      return (-1);
  #endif
  }
  /*!
  \brief Filter using Mult: D = saturation255(S1 * S2)

  If MMX is detected and at least 8 bytes are given, the MMX routine is called
  first and the C loop only handles the bytes from the last multiple of 8
  onwards; otherwise the C loop handles the whole array.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  {
      unsigned int pos = 0;       /* index of the first byte left to the C loop */
      int product;

      /* Validate input parameters */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
          /* MMX routine, then continue after the last full group of 8 */
          SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
          pos = length & 0xfffffff8;
      }

      /* C routine: nothing to do when pos already equals length */
      for (; pos < length; pos++) {
          /* NOTE(review): original author was unsure this matches the MMX routine - confirm */
          product = (int) Src1[pos] * (int) Src2[pos];
          Dest[pos] = (unsigned char) ((product > 255) ? 255 : product);
      }

      return (0);
  }
  /*!
  \brief Internal ASM Filter using MultNor: D = S1 * S2

  Processes the arrays one byte at a time, so all SrcLength bytes are handled
  here. Only the low 8 bits of every product are stored. When USE_MMX is not
  defined nothing is processed and -1 is returned.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param SrcLength The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error (NULL pointer, or ASM code not compiled in).
  */
  int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  {
      /* Never hand NULL arrays to the assembly loop */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
  #ifdef USE_MMX
      /*
       * The loop below tests its counter only after the first pass (dec/jnz).
       * A count of 0 would wrap around and the loop would run far beyond the
       * arrays, so an empty request is answered right away.
       */
      if (SrcLength == 0)
          return (0);
  #if !defined(GCC__)
      __asm
      {
          pusha
          mov edx, Src1           /* load Src1 address into edx */
          mov esi, Src2           /* load Src2 address into esi */
          mov edi, Dest           /* load Dest address into edi */
          mov ecx, SrcLength      /* load loop counter (SIZE) into ecx */
          align 16                /* 16 byte alignment of the loop entry */
  L10141:
          mov al, [edx]           /* load a byte from Src1 */
          mul [esi]               /* mul with a byte from Src2 */
          mov [edi], al           /* move a byte result to Dest */
          inc edx                 /* increment Src1, Src2, Dest */
          inc esi                 /* pointer registers by one */
          inc edi
          dec ecx                 /* decrease loop counter */
          jnz L10141              /* check loop termination, proceed if required */
          popa
      }
  #else
      /*
       * Dest is only read by the assembly (its value is the store address), so
       * it is an input operand; the bytes written through it are covered by
       * the "memory" clobber.
       */
      asm volatile
          ("pusha \n\t"
           "mov %2, %%edx \n\t"           /* load Src1 address into edx */
           "mov %1, %%esi \n\t"           /* load Src2 address into esi */
           "mov %0, %%edi \n\t"           /* load Dest address into edi */
           "mov %3, %%ecx \n\t"           /* load loop counter (SIZE) into ecx */
           ".align 16 \n\t"               /* 16 byte alignment of the loop entry */
           "1:mov (%%edx), %%al \n\t"     /* load a byte from Src1 */
           "mulb (%%esi) \n\t"            /* mul with a byte from Src2 */
           "mov %%al, (%%edi) \n\t"       /* move a byte result to Dest */
           "inc %%edx \n\t"               /* increment Src1, Src2, Dest */
           "inc %%esi \n\t"               /* pointer registers by one */
           "inc %%edi \n\t"
           "dec %%ecx \n\t"               /* decrease loop counter */
           "jnz 1b \n\t"                  /* check loop termination, proceed if required */
           "popa \n\t"
           : /* no outputs */
           : "m" (Dest),                  /* %0 */
             "m" (Src2),                  /* %1 */
             "m" (Src1),                  /* %2 */
             "m" (SrcLength)              /* %3 */
           : "memory"
          );
  #endif
      return (0);
  #else
      return (-1);
  #endif
  }
  /*!
  \brief Filter using MultNor: D = S1 * S2

  Only the low 8 bits of every product are stored. If MMX is detected the ASM
  routine is used; it walks the arrays byte by byte and therefore covers all
  length bytes, so no C pass follows it. The C loop is used when MMX is not
  detected or when the ASM routine reports an error.

  \param Src1 Pointer to the start of the first source byte array (S1).
  \param Src2 Pointer to the start of the second source byte array (S2).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source arrays.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  {
      unsigned int i;

      /* Validate input parameters */
      if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if (SDL_imageFilterMMXdetect()) {
          /*
           * The ASM routine already processes every byte. Running the C loop
           * over the trailing (length & 7) bytes afterwards would multiply them
           * a second time whenever Dest is also one of the sources.
           */
          if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0)
              return (0);
          /* ASM routine reported an error: fall through to the C routine */
      }

      /* C routine to process the whole image */
      for (i = 0; i < length; i++) {
          Dest[i] = (unsigned char) ((int) Src1[i] * (int) Src2[i]);
      }

      return (0);
  }
  824. /*!
  825. \brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
  826. \param Src1 Pointer to the start of the first source byte array (S1).
  827. \param Src2 Pointer to the start of the second source byte array (S2).
  828. \param Dest Pointer to the start of the destination byte array (D).
  829. \param SrcLength The number of bytes in the source arrays.
  830. \return Returns 0 for success or -1 for error.
  831. */
  832. int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  833. {
  834. #ifdef USE_MMX
  835. #if !defined(GCC__)
  836. __asm
  837. {
  838. pusha
  839. mov eax, Src1 /* load Src1 address into eax */
  840. mov ebx, Src2 /* load Src2 address into ebx */
  841. mov edi, Dest /* load Dest address into edi */
  842. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  843. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  844. pxor mm0, mm0 /* zero mm0 register */
  845. align 16 /* 16 byte alignment of the loop entry */
  846. L1015:
  847. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  848. movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
  849. movq mm2, mm1 /* copy mm1 into mm2 */
  850. movq mm4, mm3 /* copy mm3 into mm4 */
  851. punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
  852. punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
  853. punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
  854. punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
  855. psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
  856. psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
  857. pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
  858. pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
  859. packuswb mm1, mm2 /* pack words back into bytes with saturation */
  860. movq [edi], mm1 /* store result in Dest */
  861. add eax, 8 /* increase Src1, Src2 and Dest */
  862. add ebx, 8 /* register pointers by 8 */
  863. add edi, 8
  864. dec ecx /* decrease loop counter */
  865. jnz L1015 /* check loop termination, proceed if required */
  866. emms /* exit MMX state */
  867. popa
  868. }
  869. #else
  870. asm volatile
  871. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  872. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  873. "mov %0, %%edi \n\t" /* load Dest address into edi */
  874. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  875. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  876. "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
  877. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  878. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  879. "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
  880. "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
  881. "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
  882. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */
  883. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */
  884. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */
  885. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */
  886. "psrlw $1, %%mm1 \n\t" /* divide mm1 words by 2, Src1 low bytes */
  887. "psrlw $1, %%mm2 \n\t" /* divide mm2 words by 2, Src1 high bytes */
  888. "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */
  889. "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */
  890. "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */
  891. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  892. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  893. "add $8, %%ebx \n\t" /* register pointers by 8 */
  894. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  895. "jnz 1b \n\t" /* check loop termination, proceed if required */
  896. "emms \n\t" /* exit MMX state */
  897. "popa \n\t":"=m" (Dest) /* %0 */
  898. :"m"(Src2), /* %1 */
  899. "m"(Src1), /* %2 */
  900. "m"(SrcLength) /* %3 */
  901. );
  902. #endif
  903. return (0);
  904. #else
  905. return (-1);
  906. #endif
  907. }
  908. /*!
  909. \brief Filter using MultDivby2: D = saturation255(S1/2 * S2)
  910. \param Src1 Pointer to the start of the first source byte array (S1).
  911. \param Src2 Pointer to the start of the second source byte array (S2).
  912. \param Dest Pointer to the start of the destination byte array (D).
  913. \param length The number of bytes in the source arrays.
  914. \return Returns 0 for success or -1 for error.
  915. */
  916. int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  917. {
  918. unsigned int i, istart;
  919. unsigned char *cursrc1, *cursrc2, *curdst;
  920. int result;
  921. /* Validate input parameters */
  922. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  923. return(-1);
  924. if (length == 0)
  925. return(0);
  926. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  927. /* MMX routine */
  928. SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
  929. /* Check for unaligned bytes */
  930. if ((length & 7) > 0) {
  931. /* Setup to process unaligned bytes */
  932. istart = length & 0xfffffff8;
  933. cursrc1 = &Src1[istart];
  934. cursrc2 = &Src2[istart];
  935. curdst = &Dest[istart];
  936. } else {
  937. /* No unaligned bytes - we are done */
  938. return (0);
  939. }
  940. } else {
  941. /* Setup to process whole image */
  942. istart = 0;
  943. cursrc1 = Src1;
  944. cursrc2 = Src2;
  945. curdst = Dest;
  946. }
  947. /* C routine to process image */
  948. for (i = istart; i < length; i++) {
  949. result = ((int) *cursrc1 / 2) * (int) *cursrc2;
  950. if (result > 255)
  951. result = 255;
  952. *curdst = (unsigned char) result;
  953. /* Advance pointers */
  954. cursrc1++;
  955. cursrc2++;
  956. curdst++;
  957. }
  958. return (0);
  959. }
  960. /*!
  961. \brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
  962. \param Src1 Pointer to the start of the first source byte array (S1).
  963. \param Src2 Pointer to the start of the second source byte array (S2).
  964. \param Dest Pointer to the start of the destination byte array (D).
  965. \param SrcLength The number of bytes in the source arrays.
  966. \return Returns 0 for success or -1 for error.
  967. */
  968. int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  969. {
  970. #ifdef USE_MMX
  971. #if !defined(GCC__)
  972. __asm
  973. {
  974. pusha
  975. mov eax, Src1 /* load Src1 address into eax */
  976. mov ebx, Src2 /* load Src2 address into ebx */
  977. mov edi, Dest /* load Dest address into edi */
  978. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  979. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  980. pxor mm0, mm0 /* zero mm0 register */
  981. align 16 /* 16 byte alignment of the loop entry */
  982. L1016:
  983. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  984. movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
  985. movq mm2, mm1 /* copy mm1 into mm2 */
  986. movq mm4, mm3 /* copy mm3 into mm4 */
  987. punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
  988. punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
  989. punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
  990. punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
  991. psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
  992. psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
  993. psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */
  994. psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */
  995. pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
  996. pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
  997. packuswb mm1, mm2 /* pack words back into bytes with saturation */
  998. movq [edi], mm1 /* store result in Dest */
  999. add eax, 8 /* increase Src1, Src2 and Dest */
  1000. add ebx, 8 /* register pointers by 8 */
  1001. add edi, 8
  1002. dec ecx /* decrease loop counter */
  1003. jnz L1016 /* check loop termination, proceed if required */
  1004. emms /* exit MMX state */
  1005. popa
  1006. }
  1007. #else
  1008. asm volatile
  1009. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  1010. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  1011. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1012. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1013. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1014. "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
  1015. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1016. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  1017. "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
  1018. "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
  1019. "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
  1020. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */
  1021. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */
  1022. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of Src2 into words */
  1023. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of Src2 into words */
  1024. "psrlw $1, %%mm1 \n\t" /* divide mm1 words by 2, Src1 low bytes */
  1025. "psrlw $1, %%mm2 \n\t" /* divide mm2 words by 2, Src1 high bytes */
  1026. "psrlw $1, %%mm3 \n\t" /* divide mm3 words by 2, Src2 low bytes */
  1027. "psrlw $1, %%mm4 \n\t" /* divide mm4 words by 2, Src2 high bytes */
  1028. "pmullw %%mm3, %%mm1 \n\t" /* mul low bytes of Src1 and Src2 */
  1029. "pmullw %%mm4, %%mm2 \n\t" /* mul high bytes of Src1 and Src2 */
  1030. "packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */
  1031. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  1032. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1033. "add $8, %%ebx \n\t" /* register pointers by 8 */
  1034. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1035. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1036. "emms \n\t" /* exit MMX state */
  1037. "popa \n\t":"=m" (Dest) /* %0 */
  1038. :"m"(Src2), /* %1 */
  1039. "m"(Src1), /* %2 */
  1040. "m"(SrcLength) /* %3 */
  1041. );
  1042. #endif
  1043. return (0);
  1044. #else
  1045. return (-1);
  1046. #endif
  1047. }
  1048. /*!
  1049. \brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
  1050. \param Src1 Pointer to the start of the first source byte array (S1).
  1051. \param Src2 Pointer to the start of the second source byte array (S2).
  1052. \param Dest Pointer to the start of the destination byte array (D).
  1053. \param length The number of bytes in the source arrays.
  1054. \return Returns 0 for success or -1 for error.
  1055. */
  1056. int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1057. {
  1058. unsigned int i, istart;
  1059. unsigned char *cursrc1, *cursrc2, *curdst;
  1060. int result;
  1061. /* Validate input parameters */
  1062. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1063. return(-1);
  1064. if (length == 0)
  1065. return(0);
  1066. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1067. /* MMX routine */
  1068. SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);
  1069. /* Check for unaligned bytes */
  1070. if ((length & 7) > 0) {
  1071. /* Setup to process unaligned bytes */
  1072. istart = length & 0xfffffff8;
  1073. cursrc1 = &Src1[istart];
  1074. cursrc2 = &Src2[istart];
  1075. curdst = &Dest[istart];
  1076. } else {
  1077. /* No unaligned bytes - we are done */
  1078. return (0);
  1079. }
  1080. } else {
  1081. /* Setup to process whole image */
  1082. istart = 0;
  1083. cursrc1 = Src1;
  1084. cursrc2 = Src2;
  1085. curdst = Dest;
  1086. }
  1087. /* C routine to process image */
  1088. for (i = istart; i < length; i++) {
  1089. result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
  1090. if (result > 255)
  1091. result = 255;
  1092. *curdst = (unsigned char) result;
  1093. /* Advance pointers */
  1094. cursrc1++;
  1095. cursrc2++;
  1096. curdst++;
  1097. }
  1098. return (0);
  1099. }
  1100. /*!
  1101. \brief Internal MMX Filter using BitAnd: D = S1 & S2
  1102. \param Src1 Pointer to the start of the first source byte array (S1).
  1103. \param Src2 Pointer to the start of the second source byte array (S2).
  1104. \param Dest Pointer to the start of the destination byte array (D).
  1105. \param SrcLength The number of bytes in the source arrays.
  1106. \return Returns 0 for success or -1 for error.
  1107. */
  1108. int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  1109. {
  1110. #ifdef USE_MMX
  1111. #if !defined(GCC__)
  1112. __asm
  1113. {
  1114. pusha
  1115. mov eax, Src1 /* load Src1 address into eax */
  1116. mov ebx, Src2 /* load Src2 address into ebx */
  1117. mov edi, Dest /* load Dest address into edi */
  1118. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1119. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1120. align 16 /* 16 byte alignment of the loop entry */
  1121. L1017:
  1122. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  1123. pand mm1, [ebx] /* mm1=Src1&Src2 */
  1124. movq [edi], mm1 /* store result in Dest */
  1125. add eax, 8 /* increase Src1, Src2 and Dest */
  1126. add ebx, 8 /* register pointers by 8 */
  1127. add edi, 8
  1128. dec ecx /* decrease loop counter */
  1129. jnz L1017 /* check loop termination, proceed if required */
  1130. emms /* exit MMX state */
  1131. popa
  1132. }
  1133. #else
  1134. asm volatile
  1135. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  1136. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  1137. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1138. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1139. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1140. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1141. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  1142. "pand (%%ebx), %%mm1 \n\t" /* mm1=Src1&Src2 */
  1143. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  1144. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1145. "add $8, %%ebx \n\t" /* register pointers by 8 */
  1146. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1147. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1148. "emms \n\t" /* exit MMX state */
  1149. "popa \n\t":"=m" (Dest) /* %0 */
  1150. :"m"(Src2), /* %1 */
  1151. "m"(Src1), /* %2 */
  1152. "m"(SrcLength) /* %3 */
  1153. );
  1154. #endif
  1155. return (0);
  1156. #else
  1157. return (-1);
  1158. #endif
  1159. }
  1160. /*!
  1161. \brief Filter using BitAnd: D = S1 & S2
  1162. \param Src1 Pointer to the start of the first source byte array (S1).
  1163. \param Src2 Pointer to the start of the second source byte array (S2).
  1164. \param Dest Pointer to the start of the destination byte array (D).
  1165. \param length The number of bytes in the source arrays.
  1166. \return Returns 0 for success or -1 for error.
  1167. */
  1168. int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1169. {
  1170. unsigned int i, istart;
  1171. unsigned char *cursrc1, *cursrc2, *curdst;
  1172. /* Validate input parameters */
  1173. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1174. return(-1);
  1175. if (length == 0)
  1176. return(0);
  1177. if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
  1178. /* if (length > 7) { */
  1179. /* Call MMX routine */
  1180. SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
  1181. /* Check for unaligned bytes */
  1182. if ((length & 7) > 0) {
  1183. /* Setup to process unaligned bytes */
  1184. istart = length & 0xfffffff8;
  1185. cursrc1 = &Src1[istart];
  1186. cursrc2 = &Src2[istart];
  1187. curdst = &Dest[istart];
  1188. } else {
  1189. /* No unaligned bytes - we are done */
  1190. return (0);
  1191. }
  1192. } else {
  1193. /* Setup to process whole image */
  1194. istart = 0;
  1195. cursrc1 = Src1;
  1196. cursrc2 = Src2;
  1197. curdst = Dest;
  1198. }
  1199. /* C routine to process image */
  1200. for (i = istart; i < length; i++) {
  1201. *curdst = (*cursrc1) & (*cursrc2);
  1202. /* Advance pointers */
  1203. cursrc1++;
  1204. cursrc2++;
  1205. curdst++;
  1206. }
  1207. return (0);
  1208. }
  1209. /*!
  1210. \brief Internal MMX Filter using BitOr: D = S1 | S2
  1211. \param Src1 Pointer to the start of the first source byte array (S1).
  1212. \param Src2 Pointer to the start of the second source byte array (S2).
  1213. \param Dest Pointer to the start of the destination byte array (D).
  1214. \param SrcLength The number of bytes in the source arrays.
  1215. \return Returns 0 for success or -1 for error.
  1216. */
  1217. int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  1218. {
  1219. #ifdef USE_MMX
  1220. #if !defined(GCC__)
  1221. __asm
  1222. {
  1223. pusha
  1224. mov eax, Src1 /* load Src1 address into eax */
  1225. mov ebx, Src2 /* load Src2 address into ebx */
  1226. mov edi, Dest /* load Dest address into edi */
  1227. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1228. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1229. align 16 /* 16 byte alignment of the loop entry */
  1230. L91017:
  1231. movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
  1232. por mm1, [ebx] /* mm1=Src1|Src2 */
  1233. movq [edi], mm1 /* store result in Dest */
  1234. add eax, 8 /* increase Src1, Src2 and Dest */
  1235. add ebx, 8 /* register pointers by 8 */
  1236. add edi, 8
  1237. dec ecx /* decrease loop counter */
  1238. jnz L91017 /* check loop termination, proceed if required */
  1239. emms /* exit MMX state */
  1240. popa
  1241. }
  1242. #else
  1243. asm volatile
  1244. ("pusha \n\t" "mov %2, %%eax \n\t" /* load Src1 address into eax */
  1245. "mov %1, %%ebx \n\t" /* load Src2 address into ebx */
  1246. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1247. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1248. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1249. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1250. "1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
  1251. "por (%%ebx), %%mm1 \n\t" /* mm1=Src1|Src2 */
  1252. "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
  1253. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1254. "add $8, %%ebx \n\t" /* register pointers by 8 */
  1255. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1256. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1257. "emms \n\t" /* exit MMX state */
  1258. "popa \n\t":"=m" (Dest) /* %0 */
  1259. :"m"(Src2), /* %1 */
  1260. "m"(Src1), /* %2 */
  1261. "m"(SrcLength) /* %3 */
  1262. );
  1263. #endif
  1264. return (0);
  1265. #else
  1266. return (-1);
  1267. #endif
  1268. }
  1269. /*!
  1270. \brief Filter using BitOr: D = S1 | S2
  1271. \param Src1 Pointer to the start of the first source byte array (S1).
  1272. \param Src2 Pointer to the start of the second source byte array (S2).
  1273. \param Dest Pointer to the start of the destination byte array (D).
  1274. \param length The number of bytes in the source arrays.
  1275. \return Returns 0 for success or -1 for error.
  1276. */
  1277. int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1278. {
  1279. unsigned int i, istart;
  1280. unsigned char *cursrc1, *cursrc2, *curdst;
  1281. /* Validate input parameters */
  1282. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1283. return(-1);
  1284. if (length == 0)
  1285. return(0);
  1286. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1287. /* MMX routine */
  1288. SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
  1289. /* Check for unaligned bytes */
  1290. if ((length & 7) > 0) {
  1291. /* Setup to process unaligned bytes */
  1292. istart = length & 0xfffffff8;
  1293. cursrc1 = &Src1[istart];
  1294. cursrc2 = &Src2[istart];
  1295. curdst = &Dest[istart];
  1296. } else {
  1297. /* No unaligned bytes - we are done */
  1298. return (0);
  1299. }
  1300. } else {
  1301. /* Setup to process whole image */
  1302. istart = 0;
  1303. cursrc1 = Src1;
  1304. cursrc2 = Src2;
  1305. curdst = Dest;
  1306. }
  1307. /* C routine to process image */
  1308. for (i = istart; i < length; i++) {
  1309. *curdst = *cursrc1 | *cursrc2;
  1310. /* Advance pointers */
  1311. cursrc1++;
  1312. cursrc2++;
  1313. curdst++;
  1314. }
  1315. return (0);
  1316. }
  1317. /*!
  1318. \brief Internal ASM Filter using Div: D = S1 / S2
  1319. \param Src1 Pointer to the start of the first source byte array (S1).
  1320. \param Src2 Pointer to the start of the second source byte array (S2).
  1321. \param Dest Pointer to the start of the destination byte array (D).
  1322. \param SrcLength The number of bytes in the source arrays.
  1323. \return Returns 0 for success or -1 for error.
  1324. */
  1325. int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
  1326. {
  1327. #ifdef USE_MMX
  1328. #if !defined(GCC__)
  1329. __asm
  1330. {
  1331. pusha
  1332. mov edx, Src1 /* load Src1 address into edx */
  1333. mov esi, Src2 /* load Src2 address into esi */
  1334. mov edi, Dest /* load Dest address into edi */
  1335. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1336. align 16 /* 16 byte alignment of the loop entry */
  1337. L10191:
  1338. mov bl, [esi] /* load a byte from Src2 */
  1339. cmp bl, 0 /* check if it zero */
  1340. jnz L10192
  1341. mov [edi], 255 /* division by zero = 255 !!! */
  1342. jmp L10193
  1343. L10192:
  1344. xor ah, ah /* prepare AX, zero AH register */
  1345. mov al, [edx] /* load a byte from Src1 into AL */
  1346. div bl /* divide AL by BL */
  1347. mov [edi], al /* move a byte result to Dest */
  1348. L10193:
  1349. inc edx /* increment Src1, Src2, Dest */
  1350. inc esi /* pointer registers by one */
  1351. inc edi
  1352. dec ecx /* decrease loop counter */
  1353. jnz L10191 /* check loop termination, proceed if required */
  1354. popa
  1355. }
  1356. #else
  1357. asm volatile
  1358. ("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */
  1359. "mov %1, %%esi \n\t" /* load Src2 address into esi */
  1360. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1361. "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1362. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1363. "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */
  1364. "cmp $0, %%bl \n\t" /* check if it zero */
  1365. "jnz 2f \n\t" "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */
  1366. "jmp 3f \n\t" "2: \n\t" "xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
  1367. "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */
  1368. "div %%bl \n\t" /* divide AL by BL */
  1369. "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
  1370. "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */
  1371. "inc %%esi \n\t" /* pointer registers by one */
  1372. "inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1373. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1374. "popa \n\t":"=m" (Dest) /* %0 */
  1375. :"m"(Src2), /* %1 */
  1376. "m"(Src1), /* %2 */
  1377. "m"(SrcLength) /* %3 */
  1378. );
  1379. #endif
  1380. return (0);
  1381. #else
  1382. return (-1);
  1383. #endif
  1384. }
  1385. /*!
  1386. \brief Filter using Div: D = S1 / S2
  1387. \param Src1 Pointer to the start of the first source byte array (S1).
  1388. \param Src2 Pointer to the start of the second source byte array (S2).
  1389. \param Dest Pointer to the start of the destination byte array (D).
  1390. \param length The number of bytes in the source arrays.
  1391. \return Returns 0 for success or -1 for error.
  1392. */
  1393. int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
  1394. {
  1395. unsigned int i, istart;
  1396. unsigned char *cursrc1, *cursrc2, *curdst;
  1397. int result;
  1398. /* Validate input parameters */
  1399. if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
  1400. return(-1);
  1401. if (length == 0)
  1402. return(0);
  1403. if (SDL_imageFilterMMXdetect()) {
  1404. if (length > 0) {
  1405. /* Call ASM routine */
  1406. SDL_imageFilterDivASM(Src1, Src2, Dest, length);
  1407. /* Never unaligned bytes - we are done */
  1408. return (0);
  1409. } else {
  1410. return (-1);
  1411. }
  1412. } else {
  1413. /* Setup to process whole image */
  1414. istart = 0;
  1415. cursrc1 = Src1;
  1416. cursrc2 = Src2;
  1417. curdst = Dest;
  1418. }
  1419. /* C routine to process image */
  1420. for (i = istart; i < length; i++) {
  1421. result = (int) *cursrc1 / (int) *cursrc2;
  1422. *curdst = (unsigned char) result;
  1423. /* Advance pointers */
  1424. cursrc1++;
  1425. cursrc2++;
  1426. curdst++;
  1427. }
  1428. return (0);
  1429. }
  1430. /* ------------------------------------------------------------------------------------ */
  1431. /*!
  1432. \brief Internal MMX Filter using BitNegation: D = !S
  1433. \param Src1 Pointer to the start of the source byte array (S1).
  1434. \param Dest Pointer to the start of the destination byte array (D).
  1435. \param SrcLength The number of bytes in the source array.
  1436. \return Returns 0 for success or -1 for error.
  1437. */
  1438. int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
  1439. {
  1440. #ifdef USE_MMX
  1441. #if !defined(GCC__)
  1442. __asm
  1443. {
  1444. pusha
  1445. pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
  1446. mov eax, Src1 /* load Src1 address into eax */
  1447. mov edi, Dest /* load Dest address into edi */
  1448. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1449. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1450. align 16 /* 16 byte alignment of the loop entry */
  1451. L91117:
  1452. movq mm0, [eax] /* load 8 bytes from Src1 into mm1 */
  1453. pxor mm0, mm1 /* negate mm0 by xoring with mm1 */
  1454. movq [edi], mm0 /* store result in Dest */
  1455. add eax, 8 /* increase Src1, Src2 and Dest */
  1456. add edi, 8
  1457. dec ecx /* decrease loop counter */
  1458. jnz L91117 /* check loop termination, proceed if required */
  1459. emms /* exit MMX state */
  1460. popa
  1461. }
  1462. #else
  1463. asm volatile
  1464. ("pusha \n\t" "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
  1465. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  1466. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1467. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1468. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1469. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1470. "1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into mm1 */
  1471. "pxor %%mm1, %%mm0 \n\t" /* negate mm0 by xoring with mm1 */
  1472. "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
  1473. "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
  1474. "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
  1475. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1476. "emms \n\t" /* exit MMX state */
  1477. "popa \n\t":"=m" (Dest) /* %0 */
  1478. :"m"(Src1), /* %1 */
  1479. "m"(SrcLength) /* %2 */
  1480. );
  1481. #endif
  1482. return (0);
  1483. #else
  1484. return (-1);
  1485. #endif
  1486. }
  1487. /*!
  1488. \brief Filter using BitNegation: D = !S
  1489. \param Src1 Pointer to the start of the source byte array (S).
  1490. \param Dest Pointer to the start of the destination byte array (D).
  1491. \param length The number of bytes in the source array.
  1492. \return Returns 0 for success or -1 for error.
  1493. */
  1494. int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
  1495. {
  1496. unsigned int i, istart;
  1497. unsigned char *cursrc1, *curdst;
  1498. /* Validate input parameters */
  1499. if ((Src1 == NULL) || (Dest == NULL))
  1500. return(-1);
  1501. if (length == 0)
  1502. return(0);
  1503. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1504. /* MMX routine */
  1505. SDL_imageFilterBitNegationMMX(Src1, Dest, length);
  1506. /* Check for unaligned bytes */
  1507. if ((length & 7) > 0) {
  1508. /* Setup to process unaligned bytes */
  1509. istart = length & 0xfffffff8;
  1510. cursrc1 = &Src1[istart];
  1511. curdst = &Dest[istart];
  1512. } else {
  1513. /* No unaligned bytes - we are done */
  1514. return (0);
  1515. }
  1516. } else {
  1517. /* Setup to process whole image */
  1518. istart = 0;
  1519. cursrc1 = Src1;
  1520. curdst = Dest;
  1521. }
  1522. /* C routine to process image */
  1523. for (i = istart; i < length; i++) {
  1524. *curdst = ~(*cursrc1);
  1525. /* Advance pointers */
  1526. cursrc1++;
  1527. curdst++;
  1528. }
  1529. return (0);
  1530. }
  1531. /*!
  1532. \brief Internal MMX Filter using AddByte: D = saturation255(S + C)
  1533. \param Src1 Pointer to the start of the source byte array (S).
  1534. \param Dest Pointer to the start of the destination byte array (D).
  1535. \param SrcLength The number of bytes in the source array.
  1536. \param C Constant value to add (C).
  1537. \return Returns 0 for success or -1 for error.
  1538. */
  1539. int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
  1540. {
  1541. #ifdef USE_MMX
  1542. #if !defined(GCC__)
  1543. __asm
  1544. {
  1545. pusha
  1546. /* ** Duplicate C in 8 bytes of MM1 ** */
  1547. mov al, C /* load C into AL */
  1548. mov ah, al /* copy AL into AH */
  1549. mov bx, ax /* copy AX into BX */
  1550. shl eax, 16 /* shift 2 bytes of EAX left */
  1551. mov ax, bx /* copy BX into AX */
  1552. movd mm1, eax /* copy EAX into MM1 */
  1553. movd mm2, eax /* copy EAX into MM2 */
  1554. punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
  1555. mov eax, Src1 /* load Src1 address into eax */
  1556. mov edi, Dest /* load Dest address into edi */
  1557. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1558. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1559. align 16 /* 16 byte alignment of the loop entry */
  1560. L1021:
  1561. movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
  1562. paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
  1563. movq [edi], mm0 /* store result in Dest */
  1564. add eax, 8 /* increase Dest register pointer by 8 */
  1565. add edi, 8 /* increase Dest register pointer by 8 */
  1566. dec ecx /* decrease loop counter */
  1567. jnz L1021 /* check loop termination, proceed if required */
  1568. emms /* exit MMX state */
  1569. popa
  1570. }
  1571. #else
  1572. asm volatile
  1573. ("pusha \n\t"
  1574. /* ** Duplicate C in 8 bytes of MM1 ** */
  1575. "mov %3, %%al \n\t" /* load C into AL */
  1576. "mov %%al, %%ah \n\t" /* copy AL into AH */
  1577. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  1578. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  1579. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  1580. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  1581. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  1582. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */
  1583. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  1584. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1585. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1586. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1587. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1588. "1: \n\t"
  1589. "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
  1590. "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
  1591. "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
  1592. "add $8, %%eax \n\t" /* increase Dest register pointer by 8 */
  1593. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  1594. "dec %%ecx \n\t" /* decrease loop counter */
  1595. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1596. "emms \n\t" /* exit MMX state */
  1597. "popa \n\t":"=m" (Dest) /* %0 */
  1598. :"m"(Src1), /* %1 */
  1599. "m"(SrcLength), /* %2 */
  1600. "m"(C) /* %3 */
  1601. );
  1602. #endif
  1603. return (0);
  1604. #else
  1605. return (-1);
  1606. #endif
  1607. }
  1608. /*!
  1609. \brief Filter using AddByte: D = saturation255(S + C)
  1610. \param Src1 Pointer to the start of the source byte array (S).
  1611. \param Dest Pointer to the start of the destination byte array (D).
  1612. \param length The number of bytes in the source array.
  1613. \param C Constant value to add (C).
  1614. \return Returns 0 for success or -1 for error.
  1615. */
  1616. int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
  1617. {
  1618. unsigned int i, istart;
  1619. int iC;
  1620. unsigned char *cursrc1, *curdest;
  1621. int result;
  1622. /* Validate input parameters */
  1623. if ((Src1 == NULL) || (Dest == NULL))
  1624. return(-1);
  1625. if (length == 0)
  1626. return(0);
  1627. /* Special case: C==0 */
  1628. if (C == 0) {
  1629. memcpy(Src1, Dest, length);
  1630. return (0);
  1631. }
  1632. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1633. /* MMX routine */
  1634. SDL_imageFilterAddByteMMX(Src1, Dest, length, C);
  1635. /* Check for unaligned bytes */
  1636. if ((length & 7) > 0) {
  1637. /* Setup to process unaligned bytes */
  1638. istart = length & 0xfffffff8;
  1639. cursrc1 = &Src1[istart];
  1640. curdest = &Dest[istart];
  1641. } else {
  1642. /* No unaligned bytes - we are done */
  1643. return (0);
  1644. }
  1645. } else {
  1646. /* Setup to process whole image */
  1647. istart = 0;
  1648. cursrc1 = Src1;
  1649. curdest = Dest;
  1650. }
  1651. /* C routine to process image */
  1652. iC = (int) C;
  1653. for (i = istart; i < length; i++) {
  1654. result = (int) *cursrc1 + iC;
  1655. if (result > 255)
  1656. result = 255;
  1657. *curdest = (unsigned char) result;
  1658. /* Advance pointers */
  1659. cursrc1++;
  1660. curdest++;
  1661. }
  1662. return (0);
  1663. }
  1664. /*!
  1665. \brief Internal MMX Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
  1666. \param Src1 Pointer to the start of the source byte array (S).
  1667. \param Dest Pointer to the start of the destination byte array (D).
  1668. \param SrcLength The number of bytes in the source array.
  1669. \param C Constant to add (C).
  1670. \param D Byteorder-swapped constant to add (Cs).
  1671. \return Returns 0 for success or -1 for error.
  1672. */
  1673. int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
  1674. {
  1675. #ifdef USE_MMX
  1676. #if !defined(GCC__)
  1677. __asm
  1678. {
  1679. pusha
  1680. /* ** Duplicate (int)C in 8 bytes of MM1 ** */
  1681. mov eax, C /* load C into EAX */
  1682. movd mm1, eax /* copy EAX into MM1 */
  1683. mov eax, D /* load D into EAX */
  1684. movd mm2, eax /* copy EAX into MM2 */
  1685. punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
  1686. mov eax, Src1 /* load Src1 address into eax */
  1687. mov edi, Dest /* load Dest address into edi */
  1688. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1689. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1690. align 16 /* 16 byte alignment of the loop entry */
  1691. L11023:
  1692. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  1693. paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
  1694. movq [edi], mm0 /* store result in SrcDest */
  1695. add eax, 8 /* increase Src1 register pointer by 8 */
  1696. add edi, 8 /* increase Dest register pointer by 8 */
  1697. dec ecx /* decrease loop counter */
  1698. jnz L11023 /* check loop termination, proceed if required */
  1699. emms /* exit MMX state */
  1700. popa
  1701. }
  1702. #else
  1703. asm volatile
  1704. ("pusha \n\t"
  1705. /* ** Duplicate (int)C in 8 bytes of MM1 ** */
  1706. "mov %3, %%eax \n\t" /* load C into EAX */
  1707. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  1708. "mov %4, %%eax \n\t" /* load D into EAX */
  1709. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  1710. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */
  1711. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  1712. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1713. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1714. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1715. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1716. "1: \n\t"
  1717. "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  1718. "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
  1719. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  1720. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  1721. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  1722. "dec %%ecx \n\t" /* decrease loop counter */
  1723. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1724. "emms \n\t" /* exit MMX state */
  1725. "popa \n\t":"=m" (Dest) /* %0 */
  1726. :"m"(Src1), /* %1 */
  1727. "m"(SrcLength), /* %2 */
  1728. "m"(C), /* %3 */
  1729. "m"(D) /* %4 */
  1730. );
  1731. #endif
  1732. return (0);
  1733. #else
  1734. return (-1);
  1735. #endif
  1736. }
  1737. /*!
  1738. \brief Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
  1739. \param Src1 Pointer to the start of the source byte array (S).
  1740. \param Dest Pointer to the start of the destination byte array (D).
  1741. \param length The number of bytes in the source array.
  1742. \param C Constant to add (C).
  1743. \return Returns 0 for success or -1 for error.
  1744. */
  1745. int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
  1746. {
  1747. unsigned int i, j, istart, D;
  1748. int iC[4];
  1749. unsigned char *cursrc1;
  1750. unsigned char *curdest;
  1751. int result;
  1752. /* Validate input parameters */
  1753. if ((Src1 == NULL) || (Dest == NULL))
  1754. return(-1);
  1755. if (length == 0)
  1756. return(0);
  1757. /* Special case: C==0 */
  1758. if (C == 0) {
  1759. memcpy(Src1, Dest, length);
  1760. return (0);
  1761. }
  1762. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1763. /* MMX routine */
  1764. D=SWAP_32(C);
  1765. SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);
  1766. /* Check for unaligned bytes */
  1767. if ((length & 7) > 0) {
  1768. /* Setup to process unaligned bytes */
  1769. istart = length & 0xfffffff8;
  1770. cursrc1 = &Src1[istart];
  1771. curdest = &Dest[istart];
  1772. } else {
  1773. /* No unaligned bytes - we are done */
  1774. return (0);
  1775. }
  1776. } else {
  1777. /* Setup to process whole image */
  1778. istart = 0;
  1779. cursrc1 = Src1;
  1780. curdest = Dest;
  1781. }
  1782. /* C routine to process bytes */
  1783. iC[3] = (int) ((C >> 24) & 0xff);
  1784. iC[2] = (int) ((C >> 16) & 0xff);
  1785. iC[1] = (int) ((C >> 8) & 0xff);
  1786. iC[0] = (int) ((C >> 0) & 0xff);
  1787. for (i = istart; i < length; i += 4) {
  1788. for (j = 0; j < 4; j++) {
  1789. if ((i+j)<length) {
  1790. result = (int) *cursrc1 + iC[j];
  1791. if (result > 255) result = 255;
  1792. *curdest = (unsigned char) result;
  1793. /* Advance pointers */
  1794. cursrc1++;
  1795. curdest++;
  1796. }
  1797. }
  1798. }
  1799. return (0);
  1800. }
  1801. /*!
  1802. \brief Internal MMX Filter using AddByteToHalf: D = saturation255(S/2 + C)
  1803. \param Src1 Pointer to the start of the source byte array (S).
  1804. \param Dest Pointer to the start of the destination byte array (D).
  1805. \param SrcLength The number of bytes in the source array.
  1806. \param C Constant to add (C).
  1807. \param Mask Pointer to 8 mask bytes of value 0x7F.
  1808. \return Returns 0 for success or -1 for error.
  1809. */
  1810. int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
  1811. unsigned char *Mask)
  1812. {
  1813. #ifdef USE_MMX
  1814. #if !defined(GCC__)
  1815. __asm
  1816. {
  1817. pusha
  1818. /* ** Duplicate C in 8 bytes of MM1 ** */
  1819. mov al, C /* load C into AL */
  1820. mov ah, al /* copy AL into AH */
  1821. mov bx, ax /* copy AX into BX */
  1822. shl eax, 16 /* shift 2 bytes of EAX left */
  1823. mov ax, bx /* copy BX into AX */
  1824. movd mm1, eax /* copy EAX into MM1 */
  1825. movd mm2, eax /* copy EAX into MM2 */
  1826. punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
  1827. mov edx, Mask /* load Mask address into edx */
  1828. movq mm0, [edx] /* load Mask into mm0 */
  1829. mov eax, Src1 /* load Src1 address into eax */
  1830. mov edi, Dest /* load Dest address into edi */
  1831. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1832. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1833. align 16 /* 16 byte alignment of the loop entry */
  1834. L1022:
  1835. movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */
  1836. psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */
  1837. pand mm2, mm0 // apply Mask to 8 BYTES of MM2 */
  1838. /* byte 0x0f, 0xdb, 0xd0 */
  1839. paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */
  1840. movq [edi], mm2 /* store result in Dest */
  1841. add eax, 8 /* increase Src1 register pointer by 8 */
  1842. add edi, 8 /* increase Dest register pointer by 8 */
  1843. dec ecx /* decrease loop counter */
  1844. jnz L1022 /* check loop termination, proceed if required */
  1845. emms /* exit MMX state */
  1846. popa
  1847. }
  1848. #else
  1849. asm volatile
  1850. ("pusha \n\t"
  1851. /* ** Duplicate C in 8 bytes of MM1 ** */
  1852. "mov %3, %%al \n\t" /* load C into AL */
  1853. "mov %%al, %%ah \n\t" /* copy AL into AH */
  1854. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  1855. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  1856. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  1857. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  1858. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  1859. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */
  1860. "movl %4, %%edx \n\t" /* load Mask address into edx */
  1861. "movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */
  1862. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  1863. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1864. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1865. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  1866. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  1867. "1: \n\t"
  1868. "movq (%%eax), %%mm2 \n\t" /* load 8 bytes from Src1 into MM2 */
  1869. "psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of MM2 1 bit to the right */
  1870. /* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 */
  1871. ".byte 0x0f, 0xdb, 0xd0 \n\t"
  1872. "paddusb %%mm1, %%mm2 \n\t" /* MM2=SrcDest+C (add 8 bytes with saturation) */
  1873. "movq %%mm2, (%%edi) \n\t" /* store result in Dest */
  1874. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  1875. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  1876. "dec %%ecx \n\t" /* decrease loop counter */
  1877. "jnz 1b \n\t" /* check loop termination, proceed if required */
  1878. "emms \n\t" /* exit MMX state */
  1879. "popa \n\t":"=m" (Dest) /* %0 */
  1880. :"m"(Src1), /* %1 */
  1881. "m"(SrcLength), /* %2 */
  1882. "m"(C), /* %3 */
  1883. "m"(Mask) /* %4 */
  1884. );
  1885. #endif
  1886. return (0);
  1887. #else
  1888. return (-1);
  1889. #endif
  1890. }
  1891. /*!
  1892. \brief Filter using AddByteToHalf: D = saturation255(S/2 + C)
  1893. \param Src1 Pointer to the start of the source byte array (S).
  1894. \param Dest Pointer to the start of the destination byte array (D).
  1895. \param length The number of bytes in the source array.
  1896. \param C Constant to add (C).
  1897. \return Returns 0 for success or -1 for error.
  1898. */
  1899. int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
  1900. {
  1901. static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
  1902. unsigned int i, istart;
  1903. int iC;
  1904. unsigned char *cursrc1;
  1905. unsigned char *curdest;
  1906. int result;
  1907. /* Validate input parameters */
  1908. if ((Src1 == NULL) || (Dest == NULL))
  1909. return(-1);
  1910. if (length == 0)
  1911. return(0);
  1912. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  1913. /* MMX routine */
  1914. SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
  1915. /* Check for unaligned bytes */
  1916. if ((length & 7) > 0) {
  1917. /* Setup to process unaligned bytes */
  1918. istart = length & 0xfffffff8;
  1919. cursrc1 = &Src1[istart];
  1920. curdest = &Dest[istart];
  1921. } else {
  1922. /* No unaligned bytes - we are done */
  1923. return (0);
  1924. }
  1925. } else {
  1926. /* Setup to process whole image */
  1927. istart = 0;
  1928. cursrc1 = Src1;
  1929. curdest = Dest;
  1930. }
  1931. /* C routine to process image */
  1932. iC = (int) C;
  1933. for (i = istart; i < length; i++) {
  1934. result = (int) (*cursrc1 / 2) + iC;
  1935. if (result > 255)
  1936. result = 255;
  1937. *curdest = (unsigned char) result;
  1938. /* Advance pointers */
  1939. cursrc1++;
  1940. curdest++;
  1941. }
  1942. return (0);
  1943. }
  1944. /*!
  1945. \brief Internal MMX Filter using SubByte: D = saturation0(S - C)
  1946. \param Src1 Pointer to the start of the source byte array (S).
  1947. \param Dest Pointer to the start of the destination byte array (D).
  1948. \param SrcLength The number of bytes in the source array.
  1949. \param C Constant to subtract (C).
  1950. \return Returns 0 for success or -1 for error.
  1951. */
  1952. int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
  1953. {
  1954. #ifdef USE_MMX
  1955. #if !defined(GCC__)
  1956. __asm
  1957. {
  1958. pusha
  1959. /* ** Duplicate C in 8 bytes of MM1 ** */
  1960. mov al, C /* load C into AL */
  1961. mov ah, al /* copy AL into AH */
  1962. mov bx, ax /* copy AX into BX */
  1963. shl eax, 16 /* shift 2 bytes of EAX left */
  1964. mov ax, bx /* copy BX into AX */
  1965. movd mm1, eax /* copy EAX into MM1 */
  1966. movd mm2, eax /* copy EAX into MM2 */
  1967. punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
  1968. mov eax, Src1 /* load Src1 address into eax */
  1969. mov edi, Dest /* load Dest address into edi */
  1970. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  1971. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  1972. align 16 /* 16 byte alignment of the loop entry */
  1973. L1023:
  1974. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  1975. psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
  1976. movq [edi], mm0 /* store result in SrcDest */
  1977. add eax, 8 /* increase Src1 register pointer by 8 */
  1978. add edi, 8 /* increase Dest register pointer by 8 */
  1979. dec ecx /* decrease loop counter */
  1980. jnz L1023 /* check loop termination, proceed if required */
  1981. emms /* exit MMX state */
  1982. popa
  1983. }
  1984. #else
  1985. asm volatile
  1986. ("pusha \n\t"
  1987. /* ** Duplicate C in 8 bytes of MM1 ** */
  1988. "mov %3, %%al \n\t" /* load C into AL */
  1989. "mov %%al, %%ah \n\t" /* copy AL into AH */
  1990. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  1991. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  1992. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  1993. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  1994. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  1995. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */
  1996. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  1997. "mov %0, %%edi \n\t" /* load Dest address into edi */
  1998. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  1999. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2000. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2001. "1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  2002. "psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */
  2003. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  2004. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2005. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2006. "dec %%ecx \n\t" /* decrease loop counter */
  2007. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2008. "emms \n\t" /* exit MMX state */
  2009. "popa \n\t":"=m" (Dest) /* %0 */
  2010. :"m"(Src1), /* %1 */
  2011. "m"(SrcLength), /* %2 */
  2012. "m"(C) /* %3 */
  2013. );
  2014. #endif
  2015. return (0);
  2016. #else
  2017. return (-1);
  2018. #endif
  2019. }
  2020. /*!
  2021. \brief Filter using SubByte: D = saturation0(S - C)
  2022. \param Src1 Pointer to the start of the source byte array (S).
  2023. \param Dest Pointer to the start of the destination byte array (D).
  2024. \param length The number of bytes in the source arrays.
  2025. \param C Constant to subtract (C).
  2026. \return Returns 0 for success or -1 for error.
  2027. */
  2028. int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
  2029. {
  2030. unsigned int i, istart;
  2031. int iC;
  2032. unsigned char *cursrc1;
  2033. unsigned char *curdest;
  2034. int result;
  2035. /* Validate input parameters */
  2036. if ((Src1 == NULL) || (Dest == NULL))
  2037. return(-1);
  2038. if (length == 0)
  2039. return(0);
  2040. /* Special case: C==0 */
  2041. if (C == 0) {
  2042. memcpy(Src1, Dest, length);
  2043. return (0);
  2044. }
  2045. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2046. /* MMX routine */
  2047. SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
  2048. /* Check for unaligned bytes */
  2049. if ((length & 7) > 0) {
  2050. /* Setup to process unaligned bytes */
  2051. istart = length & 0xfffffff8;
  2052. cursrc1 = &Src1[istart];
  2053. curdest = &Dest[istart];
  2054. } else {
  2055. /* No unaligned bytes - we are done */
  2056. return (0);
  2057. }
  2058. } else {
  2059. /* Setup to process whole image */
  2060. istart = 0;
  2061. cursrc1 = Src1;
  2062. curdest = Dest;
  2063. }
  2064. /* C routine to process image */
  2065. iC = (int) C;
  2066. for (i = istart; i < length; i++) {
  2067. result = (int) *cursrc1 - iC;
  2068. if (result < 0)
  2069. result = 0;
  2070. *curdest = (unsigned char) result;
  2071. /* Advance pointers */
  2072. cursrc1++;
  2073. curdest++;
  2074. }
  2075. return (0);
  2076. }
  2077. /*!
  2078. \brief Internal MMX Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
  2079. \param Src1 Pointer to the start of the source byte array (S).
  2080. \param Dest Pointer to the start of the destination byte array (D).
  2081. \param SrcLength The number of bytes in the source array.
  2082. \param C Constant to subtract (C).
  2083. \param D Byteorder-swapped constant to subtract (Cs).
  2084. \return Returns 0 for success or -1 for error.
  2085. */
  2086. int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
  2087. {
  2088. #ifdef USE_MMX
  2089. #if !defined(GCC__)
  2090. __asm
  2091. {
  2092. pusha
  2093. /* ** Duplicate (int)C in 8 bytes of MM1 ** */
  2094. mov eax, C /* load C into EAX */
  2095. movd mm1, eax /* copy EAX into MM1 */
  2096. mov eax, D /* load D into EAX */
  2097. movd mm2, eax /* copy EAX into MM2 */
  2098. punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
  2099. mov eax, Src1 /* load Src1 address into eax */
  2100. mov edi, Dest /* load Dest address into edi */
  2101. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2102. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2103. align 16 /* 16 byte alignment of the loop entry */
  2104. L11024:
  2105. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  2106. psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
  2107. movq [edi], mm0 /* store result in SrcDest */
  2108. add eax, 8 /* increase Src1 register pointer by 8 */
  2109. add edi, 8 /* increase Dest register pointer by 8 */
  2110. dec ecx /* decrease loop counter */
  2111. jnz L11024 /* check loop termination, proceed if required */
  2112. emms /* exit MMX state */
  2113. popa
  2114. }
  2115. #else
  2116. asm volatile
  2117. ("pusha \n\t"
  2118. /* ** Duplicate (int)C in 8 bytes of MM1 ** */
  2119. "mov %3, %%eax \n\t" /* load C into EAX */
  2120. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  2121. "mov %4, %%eax \n\t" /* load D into EAX */
  2122. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  2123. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher bytes of MM1 with C */
  2124. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  2125. "mov %0, %%edi \n\t" /* load Dest address into edi */
  2126. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  2127. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2128. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2129. "1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  2130. "psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */
  2131. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  2132. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2133. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2134. "dec %%ecx \n\t" /* decrease loop counter */
  2135. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2136. "emms \n\t" /* exit MMX state */
  2137. "popa \n\t":"=m" (Dest) /* %0 */
  2138. :"m"(Src1), /* %1 */
  2139. "m"(SrcLength), /* %2 */
  2140. "m"(C), /* %3 */
  2141. "m"(D) /* %4 */
  2142. );
  2143. #endif
  2144. return (0);
  2145. #else
  2146. return (-1);
  2147. #endif
  2148. }
  2149. /*!
  2150. \brief Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
  2151. \param Src1 Pointer to the start of the source byte array (S1).
  2152. \param Dest Pointer to the start of the destination byte array (D).
  2153. \param length The number of bytes in the source array.
  2154. \param C Constant to subtract (C).
  2155. \return Returns 0 for success or -1 for error.
  2156. */
  2157. int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
  2158. {
  2159. unsigned int i, j, istart, D;
  2160. int iC[4];
  2161. unsigned char *cursrc1;
  2162. unsigned char *curdest;
  2163. int result;
  2164. /* Validate input parameters */
  2165. if ((Src1 == NULL) || (Dest == NULL))
  2166. return(-1);
  2167. if (length == 0)
  2168. return(0);
  2169. /* Special case: C==0 */
  2170. if (C == 0) {
  2171. memcpy(Src1, Dest, length);
  2172. return (0);
  2173. }
  2174. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2175. /* MMX routine */
  2176. D=SWAP_32(C);
  2177. SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
  2178. /* Check for unaligned bytes */
  2179. if ((length & 7) > 0) {
  2180. /* Setup to process unaligned bytes */
  2181. istart = length & 0xfffffff8;
  2182. cursrc1 = &Src1[istart];
  2183. curdest = &Dest[istart];
  2184. } else {
  2185. /* No unaligned bytes - we are done */
  2186. return (0);
  2187. }
  2188. } else {
  2189. /* Setup to process whole image */
  2190. istart = 0;
  2191. cursrc1 = Src1;
  2192. curdest = Dest;
  2193. }
  2194. /* C routine to process image */
  2195. iC[3] = (int) ((C >> 24) & 0xff);
  2196. iC[2] = (int) ((C >> 16) & 0xff);
  2197. iC[1] = (int) ((C >> 8) & 0xff);
  2198. iC[0] = (int) ((C >> 0) & 0xff);
  2199. for (i = istart; i < length; i += 4) {
  2200. for (j = 0; j < 4; j++) {
  2201. if ((i+j)<length) {
  2202. result = (int) *cursrc1 - iC[j];
  2203. if (result < 0) result = 0;
  2204. *curdest = (unsigned char) result;
  2205. /* Advance pointers */
  2206. cursrc1++;
  2207. curdest++;
  2208. }
  2209. }
  2210. }
  2211. return (0);
  2212. }
  2213. /*!
  2214. \brief Internal MMX Filter using ShiftRight: D = saturation0(S >> N)
  2215. \param Src1 Pointer to the start of the source byte array (S).
  2216. \param Dest Pointer to the start of the destination byte array (D).
  2217. \param SrcLength The number of bytes in the source array.
  2218. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  2219. \param Mask Byte array containing 8 bytes with 0x7F value.
  2220. \return Returns 0 for success or -1 for error.
  2221. */
  2222. int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
  2223. unsigned char *Mask)
  2224. {
  2225. #ifdef USE_MMX
  2226. #if !defined(GCC__)
  2227. __asm
  2228. {
  2229. pusha
  2230. mov edx, Mask /* load Mask address into edx */
  2231. movq mm0, [edx] /* load Mask into mm0 */
  2232. xor ecx, ecx /* zero ECX */
  2233. mov cl, N /* load loop counter (N) into CL */
  2234. movd mm3, ecx /* copy (N) into MM3 */
  2235. pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
  2236. L10240: /* ** Prepare proper bit-Mask in MM1 ** */
  2237. psrlw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the right */
  2238. pand mm1, mm0 // apply Mask to 8 BYTES of MM1 */
  2239. /* byte 0x0f, 0xdb, 0xc8 */
  2240. dec cl /* decrease loop counter */
  2241. jnz L10240 /* check loop termination, proceed if required */
  2242. /* ** Shift all bytes of the image ** */
  2243. mov eax, Src1 /* load Src1 address into eax */
  2244. mov edi, Dest /* load Dest address into edi */
  2245. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2246. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2247. align 16 /* 16 byte alignment of the loop entry */
  2248. L10241:
  2249. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  2250. psrlw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the right */
  2251. pand mm0, mm1 // apply proper bit-Mask to 8 BYTES of MM0 */
  2252. /* byte 0x0f, 0xdb, 0xc1 */
  2253. movq [edi], mm0 /* store result in SrcDest */
  2254. add eax, 8 /* increase Src1 register pointer by 8 */
  2255. add edi, 8 /* increase Dest register pointer by 8 */
  2256. dec ecx /* decrease loop counter */
  2257. jnz L10241 /* check loop termination, proceed if required */
  2258. emms /* exit MMX state */
  2259. popa
  2260. }
  2261. #else
  2262. asm volatile
  2263. ("pusha \n\t" "movl %4, %%edx \n\t" /* load Mask address into edx */
  2264. "movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */
  2265. "xor %%ecx, %%ecx \n\t" /* zero ECX */
  2266. "mov %3, %%cl \n\t" /* load loop counter (N) into CL */
  2267. "movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */
  2268. "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
  2269. "1: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */
  2270. "psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the right */
  2271. /* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */
  2272. ".byte 0x0f, 0xdb, 0xc8 \n\t"
  2273. "dec %%cl \n\t" /* decrease loop counter */
  2274. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2275. /* ** Shift all bytes of the image ** */
  2276. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  2277. "mov %0, %%edi \n\t" /* load Dest address into edi */
  2278. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  2279. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2280. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2281. "2: \n\t"
  2282. "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  2283. "psrlw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the right */
  2284. /* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */
  2285. ".byte 0x0f, 0xdb, 0xc1 \n\t"
  2286. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  2287. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2288. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2289. "dec %%ecx \n\t" /* decrease loop counter */
  2290. "jnz 2b \n\t" /* check loop termination, proceed if required */
  2291. "emms \n\t" /* exit MMX state */
  2292. "popa \n\t":"=m" (Dest) /* %0 */
  2293. :"m"(Src1), /* %1 */
  2294. "m"(SrcLength), /* %2 */
  2295. "m"(N), /* %3 */
  2296. "m"(Mask) /* %4 */
  2297. );
  2298. #endif
  2299. return (0);
  2300. #else
  2301. return (-1);
  2302. #endif
  2303. }
  2304. /*!
  2305. \brief Filter using ShiftRight: D = saturation0(S >> N)
  2306. \param Src1 Pointer to the start of the source byte array (S).
  2307. \param Dest Pointer to the start of the destination byte array (D).
  2308. \param length The number of bytes in the source array.
  2309. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  2310. \return Returns 0 for success or -1 for error.
  2311. */
  2312. int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
  2313. {
  2314. static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
  2315. unsigned int i, istart;
  2316. unsigned char *cursrc1;
  2317. unsigned char *curdest;
  2318. /* Validate input parameters */
  2319. if ((Src1 == NULL) || (Dest == NULL))
  2320. return(-1);
  2321. if (length == 0)
  2322. return(0);
  2323. /* Check shift */
  2324. if (N > 8) {
  2325. return (-1);
  2326. }
  2327. /* Special case: N==0 */
  2328. if (N == 0) {
  2329. memcpy(Src1, Dest, length);
  2330. return (0);
  2331. }
  2332. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2333. /* MMX routine */
  2334. SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
  2335. /* Check for unaligned bytes */
  2336. if ((length & 7) > 0) {
  2337. /* Setup to process unaligned bytes */
  2338. istart = length & 0xfffffff8;
  2339. cursrc1 = &Src1[istart];
  2340. curdest = &Dest[istart];
  2341. } else {
  2342. /* No unaligned bytes - we are done */
  2343. return (0);
  2344. }
  2345. } else {
  2346. /* Setup to process whole image */
  2347. istart = 0;
  2348. cursrc1 = Src1;
  2349. curdest = Dest;
  2350. }
  2351. /* C routine to process image */
  2352. for (i = istart; i < length; i++) {
  2353. *curdest = (unsigned char) *cursrc1 >> N;
  2354. /* Advance pointers */
  2355. cursrc1++;
  2356. curdest++;
  2357. }
  2358. return (0);
  2359. }
  2360. /*!
  2361. \brief Internal MMX Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
  2362. \param Src1 Pointer to the start of the source byte array (S1).
  2363. \param Dest Pointer to the start of the destination byte array (D).
  2364. \param SrcLength The number of bytes in the source array.
  2365. \param N Number of bit-positions to shift (N).
  2366. \return Returns 0 for success or -1 for error.
  2367. */
  2368. int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
  2369. {
  2370. #ifdef USE_MMX
  2371. #if !defined(GCC__)
  2372. __asm
  2373. {
  2374. pusha
  2375. mov eax, Src1 /* load Src1 address into eax */
  2376. mov edi, Dest /* load Dest address into edi */
  2377. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2378. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2379. align 16 /* 16 byte alignment of the loop entry */
  2380. L13023:
  2381. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  2382. psrld mm0, N
  2383. movq [edi], mm0 /* store result in SrcDest */
  2384. add eax, 8 /* increase Src1 register pointer by 8 */
  2385. add edi, 8 /* increase Dest register pointer by 8 */
  2386. dec ecx /* decrease loop counter */
  2387. jnz L13023 /* check loop termination, proceed if required */
  2388. emms /* exit MMX state */
  2389. popa
  2390. }
  2391. #else
  2392. asm volatile
  2393. ("pusha \n\t"
  2394. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  2395. "mov %0, %%edi \n\t" /* load Dest address into edi */
  2396. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  2397. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2398. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2399. "1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  2400. "psrld %3, %%mm0 \n\t"
  2401. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  2402. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2403. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2404. "dec %%ecx \n\t" /* decrease loop counter */
  2405. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2406. "emms \n\t" /* exit MMX state */
  2407. "popa \n\t":"=m" (Dest) /* %0 */
  2408. :"m"(Src1), /* %1 */
  2409. "m"(SrcLength), /* %2 */
  2410. "m"(N) /* %3 */
  2411. );
  2412. #endif
  2413. return (0);
  2414. #else
  2415. return (-1);
  2416. #endif
  2417. }
  2418. /*!
  2419. \brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
  2420. \param Src1 Pointer to the start of the source byte array (S1).
  2421. \param Dest Pointer to the start of the destination byte array (D).
  2422. \param length The number of bytes in the source array.
  2423. \param N Number of bit-positions to shift (N). Valid range is 0 to 32.
  2424. \return Returns 0 for success or -1 for error.
  2425. */
  2426. int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
  2427. {
  2428. unsigned int i, istart;
  2429. unsigned char *cursrc1, *curdest;
  2430. unsigned int *icursrc1, *icurdest;
  2431. int result;
  2432. /* Validate input parameters */
  2433. if ((Src1 == NULL) || (Dest == NULL))
  2434. return(-1);
  2435. if (length == 0)
  2436. return(0);
  2437. if (N > 32) {
  2438. return (-1);
  2439. }
  2440. /* Special case: N==0 */
  2441. if (N == 0) {
  2442. memcpy(Src1, Dest, length);
  2443. return (0);
  2444. }
  2445. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2446. SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
  2447. /* Check for unaligned bytes */
  2448. if ((length & 7) > 0) {
  2449. /* Setup to process unaligned bytes */
  2450. istart = length & 0xfffffff8;
  2451. cursrc1 = &Src1[istart];
  2452. curdest = &Dest[istart];
  2453. } else {
  2454. /* No unaligned bytes - we are done */
  2455. return (0);
  2456. }
  2457. } else {
  2458. /* Setup to process whole image */
  2459. istart = 0;
  2460. cursrc1 = Src1;
  2461. curdest = Dest;
  2462. }
  2463. /* C routine to process image */
  2464. icursrc1=(unsigned int *)cursrc1;
  2465. icurdest=(unsigned int *)curdest;
  2466. for (i = istart; i < length; i += 4) {
  2467. if ((i+4)<length) {
  2468. result = ((unsigned int)*icursrc1 >> N);
  2469. *icurdest = (unsigned int)result;
  2470. }
  2471. /* Advance pointers */
  2472. icursrc1++;
  2473. icurdest++;
  2474. }
  2475. return (0);
  2476. }
  2477. /*!
  2478. \brief Internal MMX Filter using MultByByte: D = saturation255(S * C)
  2479. \param Src1 Pointer to the start of the source byte array (S).
  2480. \param Dest Pointer to the start of the destination byte array (D).
  2481. \param SrcLength The number of bytes in the source array.
  2482. \param C Constant to multiply with (C).
  2483. \return Returns 0 for success or -1 for error.
  2484. */
  2485. int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
  2486. {
  2487. #ifdef USE_MMX
  2488. #if !defined(GCC__)
  2489. __asm
  2490. {
  2491. pusha
  2492. /* ** Duplicate C in 4 words of MM1 ** */
  2493. mov al, C /* load C into AL */
  2494. xor ah, ah /* zero AH */
  2495. mov bx, ax /* copy AX into BX */
  2496. shl eax, 16 /* shift 2 bytes of EAX left */
  2497. mov ax, bx /* copy BX into AX */
  2498. movd mm1, eax /* copy EAX into MM1 */
  2499. movd mm2, eax /* copy EAX into MM2 */
  2500. punpckldq mm1, mm2 /* fill higher words of MM1 with C */
  2501. pxor mm0, mm0 /* zero MM0 register */
  2502. mov eax, Src1 /* load Src1 address into eax */
  2503. mov edi, Dest /* load Dest address into edi */
  2504. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2505. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2506. cmp al, 128 /* if (C <= 128) execute more efficient code */
  2507. jg L10251
  2508. align 16 /* 16 byte alignment of the loop entry */
  2509. L10250:
  2510. movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
  2511. movq mm4, mm3 /* copy MM3 into MM4 */
  2512. punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
  2513. punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
  2514. pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
  2515. pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
  2516. packuswb mm3, mm4 /* pack words back into bytes with saturation */
  2517. movq [edi], mm3 /* store result in Dest */
  2518. add eax, 8 /* increase Src1 register pointer by 8 */
  2519. add edi, 8 /* increase Dest register pointer by 8 */
  2520. dec ecx /* decrease loop counter */
  2521. jnz L10250 /* check loop termination, proceed if required */
  2522. jmp L10252
  2523. align 16 /* 16 byte alignment of the loop entry */
  2524. L10251:
  2525. movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
  2526. movq mm4, mm3 /* copy MM3 into MM4 */
  2527. punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
  2528. punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
  2529. pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
  2530. pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
  2531. /* ** Take abs value of the results (signed words) ** */
  2532. movq mm5, mm3 /* copy mm3 into mm5 */
  2533. movq mm6, mm4 /* copy mm4 into mm6 */
  2534. psraw mm5, 15 /* fill mm5 words with word sign bit */
  2535. psraw mm6, 15 /* fill mm6 words with word sign bit */
  2536. pxor mm3, mm5 /* take 1's compliment of only neg words */
  2537. pxor mm4, mm6 /* take 1's compliment of only neg words */
  2538. psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
  2539. psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
  2540. packuswb mm3, mm4 /* pack words back into bytes with saturation */
  2541. movq [edi], mm3 /* store result in Dest */
  2542. add eax, 8 /* increase Src1 register pointer by 8 */
  2543. add edi, 8 /* increase Dest register pointer by 8 */
  2544. dec ecx /* decrease loop counter */
  2545. jnz L10251 /* check loop termination, proceed if required */
  2546. L10252:
  2547. emms /* exit MMX state */
  2548. popa
  2549. }
  2550. #else
  2551. asm volatile
  2552. ("pusha \n\t"
  2553. /* ** Duplicate C in 4 words of MM1 ** */
  2554. "mov %3, %%al \n\t" /* load C into AL */
  2555. "xor %%ah, %%ah \n\t" /* zero AH */
  2556. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  2557. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  2558. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  2559. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  2560. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  2561. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher words of MM1 with C */
  2562. "pxor %%mm0, %%mm0 \n\t" /* zero MM0 register */
  2563. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  2564. "mov %0, %%edi \n\t" /* load Dest address into edi */
  2565. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  2566. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2567. "cmp $128, %%al \n\t" /* if (C <= 128) execute more efficient code */
  2568. "jg 2f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2569. "1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
  2570. "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
  2571. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
  2572. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
  2573. "pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest and MM1 */
  2574. "pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest and MM1 */
  2575. "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */
  2576. "movq %%mm3, (%%edi) \n\t" /* store result in Dest */
  2577. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2578. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2579. "dec %%ecx \n\t" /* decrease loop counter */
  2580. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2581. "jmp 3f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2582. "2: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
  2583. "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
  2584. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
  2585. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
  2586. "pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest and MM1 */
  2587. "pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest and MM1 */
  2588. /* ** Take abs value of the results (signed words) ** */
  2589. "movq %%mm3, %%mm5 \n\t" /* copy mm3 into mm5 */
  2590. "movq %%mm4, %%mm6 \n\t" /* copy mm4 into mm6 */
  2591. "psraw $15, %%mm5 \n\t" /* fill mm5 words with word sign bit */
  2592. "psraw $15, %%mm6 \n\t" /* fill mm6 words with word sign bit */
  2593. "pxor %%mm5, %%mm3 \n\t" /* take 1's compliment of only neg. words */
  2594. "pxor %%mm6, %%mm4 \n\t" /* take 1's compliment of only neg. words */
  2595. "psubsw %%mm5, %%mm3 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  2596. "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  2597. "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */
  2598. "movq %%mm3, (%%edi) \n\t" /* store result in Dest */
  2599. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2600. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2601. "dec %%ecx \n\t" /* decrease loop counter */
  2602. "jnz 2b \n\t" /* check loop termination, proceed if required */
  2603. "3: emms \n\t" /* exit MMX state */
  2604. "popa \n\t":"=m" (Dest) /* %0 */
  2605. :"m"(Src1), /* %1 */
  2606. "m"(SrcLength), /* %2 */
  2607. "m"(C) /* %3 */
  2608. );
  2609. #endif
  2610. return (0);
  2611. #else
  2612. return (-1);
  2613. #endif
  2614. }
  2615. /*!
  2616. \brief Filter using MultByByte: D = saturation255(S * C)
  2617. \param Src1 Pointer to the start of the source byte array (S).
  2618. \param Dest Pointer to the start of the destination byte array (D).
  2619. \param length The number of bytes in the source arrays.
  2620. \param C Constant to multiply with (C).
  2621. \return Returns 0 for success or -1 for error.
  2622. */
  2623. int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
  2624. {
  2625. unsigned int i, istart;
  2626. int iC;
  2627. unsigned char *cursrc1;
  2628. unsigned char *curdest;
  2629. int result;
  2630. /* Validate input parameters */
  2631. if ((Src1 == NULL) || (Dest == NULL))
  2632. return(-1);
  2633. if (length == 0)
  2634. return(0);
  2635. /* Special case: C==1 */
  2636. if (C == 1) {
  2637. memcpy(Src1, Dest, length);
  2638. return (0);
  2639. }
  2640. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2641. SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
  2642. /* Check for unaligned bytes */
  2643. if ((length & 7) > 0) {
  2644. /* Setup to process unaligned bytes */
  2645. istart = length & 0xfffffff8;
  2646. cursrc1 = &Src1[istart];
  2647. curdest = &Dest[istart];
  2648. } else {
  2649. /* No unaligned bytes - we are done */
  2650. return (0);
  2651. }
  2652. } else {
  2653. /* Setup to process whole image */
  2654. istart = 0;
  2655. cursrc1 = Src1;
  2656. curdest = Dest;
  2657. }
  2658. /* C routine to process image */
  2659. iC = (int) C;
  2660. for (i = istart; i < length; i++) {
  2661. result = (int) *cursrc1 * iC;
  2662. if (result > 255)
  2663. result = 255;
  2664. *curdest = (unsigned char) result;
  2665. /* Advance pointers */
  2666. cursrc1++;
  2667. curdest++;
  2668. }
  2669. return (0);
  2670. }
  2671. /*!
  2672. \brief Internal MMX Filter using ShiftRightAndMultByByteMMX: D = saturation255((S >> N) * C)
  2673. \param Src1 Pointer to the start of the source byte array (S).
  2674. \param Dest Pointer to the start of the destination byte array (D).
  2675. \param SrcLength The number of bytes in the source array.
  2676. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  2677. \param C Constant to multiply with (C).
  2678. \return Returns 0 for success or -1 for error.
  2679. */
  2680. int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
  2681. unsigned char C)
  2682. {
  2683. #ifdef USE_MMX
  2684. #if !defined(GCC__)
  2685. __asm
  2686. {
  2687. pusha
  2688. /* ** Duplicate C in 4 words of MM1 ** */
  2689. mov al, C /* load C into AL */
  2690. xor ah, ah /* zero AH */
  2691. mov bx, ax /* copy AX into BX */
  2692. shl eax, 16 /* shift 2 bytes of EAX left */
  2693. mov ax, bx /* copy BX into AX */
  2694. movd mm1, eax /* copy EAX into MM1 */
  2695. movd mm2, eax /* copy EAX into MM2 */
  2696. punpckldq mm1, mm2 /* fill higher words of MM1 with C */
  2697. xor ecx, ecx /* zero ECX */
  2698. mov cl, N /* load N into CL */
  2699. movd mm7, ecx /* copy N into MM7 */
  2700. pxor mm0, mm0 /* zero MM0 register */
  2701. mov eax, Src1 /* load Src1 address into eax */
  2702. mov edi, Dest /* load Dest address into edi */
  2703. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2704. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2705. align 16 /* 16 byte alignment of the loop entry */
  2706. L1026:
  2707. movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
  2708. movq mm4, mm3 /* copy MM3 into MM4 */
  2709. punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
  2710. punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
  2711. psrlw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */
  2712. psrlw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */
  2713. pmullw mm3, mm1 /* mul low bytes of SrcDest by MM1 */
  2714. pmullw mm4, mm1 /* mul high bytes of SrcDest by MM1 */
  2715. packuswb mm3, mm4 /* pack words back into bytes with saturation */
  2716. movq [edi], mm3 /* store result in Dest */
  2717. add eax, 8 /* increase Src1 register pointer by 8 */
  2718. add edi, 8 /* increase Dest register pointer by 8 */
  2719. dec ecx /* decrease loop counter */
  2720. jnz L1026 /* check loop termination, proceed if required */
  2721. emms /* exit MMX state */
  2722. popa
  2723. }
  2724. #else
  2725. asm volatile
  2726. ("pusha \n\t"
  2727. /* ** Duplicate C in 4 words of MM1 ** */
  2728. "mov %4, %%al \n\t" /* load C into AL */
  2729. "xor %%ah, %%ah \n\t" /* zero AH */
  2730. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  2731. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  2732. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  2733. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  2734. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  2735. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher words of MM1 with C */
  2736. "xor %%ecx, %%ecx \n\t" /* zero ECX */
  2737. "mov %3, %%cl \n\t" /* load N into CL */
  2738. "movd %%ecx, %%mm7 \n\t" /* copy N into MM7 */
  2739. "pxor %%mm0, %%mm0 \n\t" /* zero MM0 register */
  2740. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  2741. "mov %0, %%edi \n\t" /* load Dest address into edi */
  2742. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  2743. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2744. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2745. "1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
  2746. "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
  2747. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
  2748. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
  2749. "psrlw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the right */
  2750. "psrlw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the right */
  2751. "pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest by MM1 */
  2752. "pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest by MM1 */
  2753. "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */
  2754. "movq %%mm3, (%%edi) \n\t" /* store result in Dest */
  2755. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2756. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2757. "dec %%ecx \n\t" /* decrease loop counter */
  2758. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2759. "emms \n\t" /* exit MMX state */
  2760. "popa \n\t":"=m" (Dest) /* %0 */
  2761. :"m"(Src1), /* %1 */
  2762. "m"(SrcLength), /* %2 */
  2763. "m"(N), /* %3 */
  2764. "m"(C) /* %4 */
  2765. );
  2766. #endif
  2767. return (0);
  2768. #else
  2769. return (-1);
  2770. #endif
  2771. }
  2772. /*!
  2773. \brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)
  2774. \param Src1 Pointer to the start of the source byte array (S).
  2775. \param Dest Pointer to the start of the destination byte array (D).
  2776. \param length The number of bytes in the source array.
  2777. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  2778. \param C Constant to multiply with (C).
  2779. \return Returns 0 for success or -1 for error.
  2780. */
  2781. int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
  2782. unsigned char C)
  2783. {
  2784. unsigned int i, istart;
  2785. int iC;
  2786. unsigned char *cursrc1;
  2787. unsigned char *curdest;
  2788. int result;
  2789. /* Validate input parameters */
  2790. if ((Src1 == NULL) || (Dest == NULL))
  2791. return(-1);
  2792. if (length == 0)
  2793. return(0);
  2794. /* Check shift */
  2795. if (N > 8) {
  2796. return (-1);
  2797. }
  2798. /* Special case: N==0 && C==1 */
  2799. if ((N == 0) && (C == 1)) {
  2800. memcpy(Src1, Dest, length);
  2801. return (0);
  2802. }
  2803. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2804. SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
  2805. /* Check for unaligned bytes */
  2806. if ((length & 7) > 0) {
  2807. /* Setup to process unaligned bytes */
  2808. istart = length & 0xfffffff8;
  2809. cursrc1 = &Src1[istart];
  2810. curdest = &Dest[istart];
  2811. } else {
  2812. /* No unaligned bytes - we are done */
  2813. return (0);
  2814. }
  2815. } else {
  2816. /* Setup to process whole image */
  2817. istart = 0;
  2818. cursrc1 = Src1;
  2819. curdest = Dest;
  2820. }
  2821. /* C routine to process image */
  2822. iC = (int) C;
  2823. for (i = istart; i < length; i++) {
  2824. result = (int) (*cursrc1 >> N) * iC;
  2825. if (result > 255)
  2826. result = 255;
  2827. *curdest = (unsigned char) result;
  2828. /* Advance pointers */
  2829. cursrc1++;
  2830. curdest++;
  2831. }
  2832. return (0);
  2833. }
  2834. /*!
  2835. \brief Internal MMX Filter using ShiftLeftByte: D = (S << N)
  2836. \param Src1 Pointer to the start of the source byte array (S).
  2837. \param Dest Pointer to the start of the destination byte array (D).
  2838. \param SrcLength The number of bytes in the source arrays.
  2839. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  2840. \param Mask Byte array containing 8 bytes of 0xFE value.
  2841. \return Returns 0 for success or -1 for error.
  2842. */
  2843. int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
  2844. unsigned char *Mask)
  2845. {
  2846. #ifdef USE_MMX
  2847. #if !defined(GCC__)
  2848. __asm
  2849. {
  2850. pusha
  2851. mov edx, Mask /* load Mask address into edx */
  2852. movq mm0, [edx] /* load Mask into mm0 */
  2853. xor ecx, ecx /* zero ECX */
  2854. mov cl, N /* load loop counter (N) into CL */
  2855. movd mm3, ecx /* copy (N) into MM3 */
  2856. pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
  2857. L10270: /* ** Prepare proper bit-Mask in MM1 ** */
  2858. psllw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the left */
  2859. pand mm1, mm0 // apply Mask to 8 BYTES of MM1 */
  2860. /* byte 0x0f, 0xdb, 0xc8 */
  2861. dec cl /* decrease loop counter */
  2862. jnz L10270 /* check loop termination, proceed if required */
  2863. /* ** Shift all bytes of the image ** */
  2864. mov eax, Src1 /* load Src1 address into eax */
  2865. mov edi, Dest /* load SrcDest address into edi */
  2866. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2867. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2868. align 16 /* 16 byte alignment of the loop entry */
  2869. L10271:
  2870. movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
  2871. psllw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the left */
  2872. pand mm0, mm1 // apply proper bit-Mask to 8 BYTES of MM0 */
  2873. /* byte 0x0f, 0xdb, 0xc1 */
  2874. movq [edi], mm0 /* store result in Dest */
  2875. add eax, 8 /* increase Src1 register pointer by 8 */
  2876. add edi, 8 /* increase Dest register pointer by 8 */
  2877. dec ecx /* decrease loop counter */
  2878. jnz L10271 /* check loop termination, proceed if required */
  2879. emms /* exit MMX state */
  2880. popa
  2881. }
  2882. #else
  2883. asm volatile
  2884. ("pusha \n\t" "movl %4, %%edx \n\t" /* load Mask address into edx */
  2885. "movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */
  2886. "xor %%ecx, %%ecx \n\t" /* zero ECX */
  2887. "mov %3, %%cl \n\t" /* load loop counter (N) into CL */
  2888. "movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */
  2889. "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
  2890. "1: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */
  2891. "psllw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the left */
  2892. /* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */
  2893. ".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" /* decrease loop counter */
  2894. "jnz 1b \n\t" /* check loop termination, proceed if required */
  2895. /* ** Shift all bytes of the image ** */
  2896. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  2897. "mov %0, %%edi \n\t" /* load SrcDest address into edi */
  2898. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  2899. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  2900. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  2901. "2: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
  2902. "psllw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the left */
  2903. /* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */
  2904. ".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
  2905. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  2906. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  2907. "dec %%ecx \n\t" /* decrease loop counter */
  2908. "jnz 2b \n\t" /* check loop termination, proceed if required */
  2909. "emms \n\t" /* exit MMX state */
  2910. "popa \n\t":"=m" (Dest) /* %0 */
  2911. :"m"(Src1), /* %1 */
  2912. "m"(SrcLength), /* %2 */
  2913. "m"(N), /* %3 */
  2914. "m"(Mask) /* %4 */
  2915. );
  2916. #endif
  2917. return (0);
  2918. #else
  2919. return (-1);
  2920. #endif
  2921. }
  2922. /*!
  2923. \brief Filter using ShiftLeftByte: D = (S << N)
  2924. \param Src1 Pointer to the start of the source byte array (S).
  2925. \param Dest Pointer to the start of the destination byte array (D).
  2926. \param length The number of bytes in the source arrays.
  2927. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  2928. \return Returns 0 for success or -1 for error.
  2929. */
  2930. int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
  2931. {
  2932. static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
  2933. unsigned int i, istart;
  2934. unsigned char *cursrc1, *curdest;
  2935. int result;
  2936. /* Validate input parameters */
  2937. if ((Src1 == NULL) || (Dest == NULL))
  2938. return(-1);
  2939. if (length == 0)
  2940. return(0);
  2941. if (N > 8) {
  2942. return (-1);
  2943. }
  2944. /* Special case: N==0 */
  2945. if (N == 0) {
  2946. memcpy(Src1, Dest, length);
  2947. return (0);
  2948. }
  2949. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  2950. SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
  2951. /* Check for unaligned bytes */
  2952. if ((length & 7) > 0) {
  2953. /* Setup to process unaligned bytes */
  2954. istart = length & 0xfffffff8;
  2955. cursrc1 = &Src1[istart];
  2956. curdest = &Dest[istart];
  2957. } else {
  2958. /* No unaligned bytes - we are done */
  2959. return (0);
  2960. }
  2961. } else {
  2962. /* Setup to process whole image */
  2963. istart = 0;
  2964. cursrc1 = Src1;
  2965. curdest = Dest;
  2966. }
  2967. /* C routine to process image */
  2968. for (i = istart; i < length; i++) {
  2969. result = ((int) *cursrc1 << N) & 0xff;
  2970. *curdest = (unsigned char) result;
  2971. /* Advance pointers */
  2972. cursrc1++;
  2973. curdest++;
  2974. }
  2975. return (0);
  2976. }
  2977. /*!
  2978. \brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)
  2979. \param Src1 Pointer to the start of the source byte array (S).
  2980. \param Dest Pointer to the start of the destination byte array (D).
  2981. \param SrcLength The number of bytes in the source array.
  2982. \param N Number of bit-positions to shift (N). Valid range is 0 to 32.
  2983. \return Returns 0 for success or -1 for error.
  2984. */
  2985. int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
  2986. {
  2987. #ifdef USE_MMX
  2988. #if !defined(GCC__)
  2989. __asm
  2990. {
  2991. pusha
  2992. mov eax, Src1 /* load Src1 address into eax */
  2993. mov edi, Dest /* load Dest address into edi */
  2994. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  2995. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  2996. align 16 /* 16 byte alignment of the loop entry */
  2997. L12023:
  2998. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  2999. pslld mm0, N /* MM0=SrcDest+C (add 8 bytes with saturation) */
  3000. movq [edi], mm0 /* store result in SrcDest */
  3001. add eax, 8 /* increase Src1 register pointer by 8 */
  3002. add edi, 8 /* increase Dest register pointer by 8 */
  3003. dec ecx /* decrease loop counter */
  3004. jnz L12023 /* check loop termination, proceed if required */
  3005. emms /* exit MMX state */
  3006. popa
  3007. }
  3008. #else
  3009. asm volatile
  3010. ("pusha \n\t"
  3011. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  3012. "mov %0, %%edi \n\t" /* load Dest address into edi */
  3013. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  3014. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  3015. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3016. "1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  3017. "pslld %3, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
  3018. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  3019. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  3020. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  3021. "dec %%ecx \n\t" /* decrease loop counter */
  3022. "jnz 1b \n\t" /* check loop termination, proceed if required */
  3023. "emms \n\t" /* exit MMX state */
  3024. "popa \n\t":"=m" (Dest) /* %0 */
  3025. :"m"(Src1), /* %1 */
  3026. "m"(SrcLength), /* %2 */
  3027. "m"(N) /* %3 */
  3028. );
  3029. #endif
  3030. return (0);
  3031. #else
  3032. return (-1);
  3033. #endif
  3034. }
  3035. /*!
  3036. \brief Filter using ShiftLeftUint: D = ((uint)S << N)
  3037. \param Src1 Pointer to the start of the source byte array (S).
  3038. \param Dest Pointer to the start of the destination byte array (D).
  3039. \param length The number of bytes in the source array.
  3040. \param N Number of bit-positions to shift (N). Valid range is 0 to 32.
  3041. \return Returns 0 for success or -1 for error.
  3042. */
  3043. int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
  3044. {
  3045. unsigned int i, istart;
  3046. unsigned char *cursrc1, *curdest;
  3047. unsigned int *icursrc1, *icurdest;
  3048. int result;
  3049. /* Validate input parameters */
  3050. if ((Src1 == NULL) || (Dest == NULL))
  3051. return(-1);
  3052. if (length == 0)
  3053. return(0);
  3054. if (N > 32) {
  3055. return (-1);
  3056. }
  3057. /* Special case: N==0 */
  3058. if (N == 0) {
  3059. memcpy(Src1, Dest, length);
  3060. return (0);
  3061. }
  3062. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  3063. SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
  3064. /* Check for unaligned bytes */
  3065. if ((length & 7) > 0) {
  3066. /* Setup to process unaligned bytes */
  3067. istart = length & 0xfffffff8;
  3068. cursrc1 = &Src1[istart];
  3069. curdest = &Dest[istart];
  3070. } else {
  3071. /* No unaligned bytes - we are done */
  3072. return (0);
  3073. }
  3074. } else {
  3075. /* Setup to process whole image */
  3076. istart = 0;
  3077. cursrc1 = Src1;
  3078. curdest = Dest;
  3079. }
  3080. /* C routine to process image */
  3081. icursrc1=(unsigned int *)cursrc1;
  3082. icurdest=(unsigned int *)curdest;
  3083. for (i = istart; i < length; i += 4) {
  3084. if ((i+4)<length) {
  3085. result = ((unsigned int)*icursrc1 << N);
  3086. *icurdest = (unsigned int)result;
  3087. }
  3088. /* Advance pointers */
  3089. icursrc1++;
  3090. icurdest++;
  3091. }
  3092. return (0);
  3093. }
  3094. /*!
  3095. \brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)
  3096. \param Src1 Pointer to the start of the source byte array (S1).
  3097. \param Dest Pointer to the start of the destination byte array (D).
  3098. \param SrcLength The number of bytes in the source array.
  3099. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  3100. \return Returns 0 for success or -1 for error.
  3101. */
  3102. int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
  3103. {
  3104. #ifdef USE_MMX
  3105. #if !defined(GCC__)
  3106. __asm
  3107. {
  3108. pusha
  3109. xor eax, eax /* zero EAX */
  3110. mov al, N /* load N into AL */
  3111. movd mm7, eax /* copy N into MM7 */
  3112. pxor mm0, mm0 /* zero MM0 register */
  3113. mov eax, Src1 /* load Src1 address into eax */
  3114. mov edi, Dest /* load Dest address into edi */
  3115. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  3116. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  3117. cmp al, 7 /* if (N <= 7) execute more efficient code */
  3118. jg L10281
  3119. align 16 /* 16 byte alignment of the loop entry */
  3120. L10280:
  3121. movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
  3122. movq mm4, mm3 /* copy MM3 into MM4 */
  3123. punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
  3124. punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
  3125. psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */
  3126. psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */
  3127. packuswb mm3, mm4 /* pack words back into bytes with saturation */
  3128. movq [edi], mm3 /* store result in Dest */
  3129. add eax, 8 /* increase Src1 register pointer by 8 */
  3130. add edi, 8 /* increase Dest register pointer by 8 */
  3131. dec ecx /* decrease loop counter */
  3132. jnz L10280 /* check loop termination, proceed if required */
  3133. jmp L10282
  3134. align 16 /* 16 byte alignment of the loop entry */
  3135. L10281:
  3136. movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
  3137. movq mm4, mm3 /* copy MM3 into MM4 */
  3138. punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
  3139. punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
  3140. psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */
  3141. psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */
  3142. /* ** Take abs value of the signed words ** */
  3143. movq mm5, mm3 /* copy mm3 into mm5 */
  3144. movq mm6, mm4 /* copy mm4 into mm6 */
  3145. psraw mm5, 15 /* fill mm5 words with word sign bit */
  3146. psraw mm6, 15 /* fill mm6 words with word sign bit */
  3147. pxor mm3, mm5 /* take 1's compliment of only neg words */
  3148. pxor mm4, mm6 /* take 1's compliment of only neg words */
  3149. psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
  3150. psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
  3151. packuswb mm3, mm4 /* pack words back into bytes with saturation */
  3152. movq [edi], mm3 /* store result in Dest */
  3153. add eax, 8 /* increase Src1 register pointer by 8 */
  3154. add edi, 8 /* increase Dest register pointer by 8 */
  3155. dec ecx /* decrease loop counter */
  3156. jnz L10281 /* check loop termination, proceed if required */
  3157. L10282:
  3158. emms /* exit MMX state */
  3159. popa
  3160. }
  3161. #else
  3162. asm volatile
  3163. ("pusha \n\t" "xor %%eax, %%eax \n\t" /* zero EAX */
  3164. "mov %3, %%al \n\t" /* load N into AL */
  3165. "movd %%eax, %%mm7 \n\t" /* copy N into MM7 */
  3166. "pxor %%mm0, %%mm0 \n\t" /* zero MM0 register */
  3167. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  3168. "mov %0, %%edi \n\t" /* load Dest address into edi */
  3169. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  3170. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  3171. "cmp $7, %%al \n\t" /* if (N <= 7) execute more efficient code */
  3172. "jg 2f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3173. "1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
  3174. "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
  3175. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
  3176. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
  3177. "psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the right */
  3178. "psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the right */
  3179. "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */
  3180. "movq %%mm3, (%%edi) \n\t" /* store result in Dest */
  3181. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  3182. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  3183. "dec %%ecx \n\t" /* decrease loop counter */
  3184. "jnz 1b \n\t" /* check loop termination, proceed if required */
  3185. "jmp 3f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3186. "2: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
  3187. "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
  3188. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
  3189. "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
  3190. "psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the right */
  3191. "psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the right */
  3192. /* ** Take abs value of the signed words ** */
  3193. "movq %%mm3, %%mm5 \n\t" /* copy mm3 into mm5 */
  3194. "movq %%mm4, %%mm6 \n\t" /* copy mm4 into mm6 */
  3195. "psraw $15, %%mm5 \n\t" /* fill mm5 words with word sign bit */
  3196. "psraw $15, %%mm6 \n\t" /* fill mm6 words with word sign bit */
  3197. "pxor %%mm5, %%mm3 \n\t" /* take 1's compliment of only neg. words */
  3198. "pxor %%mm6, %%mm4 \n\t" /* take 1's compliment of only neg. words */
  3199. "psubsw %%mm5, %%mm3 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  3200. "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  3201. "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */
  3202. "movq %%mm3, (%%edi) \n\t" /* store result in Dest */
  3203. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  3204. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  3205. "dec %%ecx \n\t" /* decrease loop counter */
  3206. "jnz 2b \n\t" /* check loop termination, proceed if required */
  3207. "3: emms \n\t" /* exit MMX state */
  3208. "popa \n\t":"=m" (Dest) /* %0 */
  3209. :"m"(Src1), /* %1 */
  3210. "m"(SrcLength), /* %2 */
  3211. "m"(N) /* %3 */
  3212. );
  3213. #endif
  3214. return (0);
  3215. #else
  3216. return (-1);
  3217. #endif
  3218. }
  3219. /*!
  3220. \brief Filter ShiftLeft: D = saturation255(S << N)
  3221. \param Src1 Pointer to the start of the source byte array (S1).
  3222. \param Dest Pointer to the start of the destination byte array (D).
  3223. \param length The number of bytes in the source array.
  3224. \param N Number of bit-positions to shift (N). Valid range is 0 to 8.
  3225. \return Returns 0 for success or -1 for error.
  3226. */
  3227. int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
  3228. {
  3229. unsigned int i, istart;
  3230. unsigned char *cursrc1, *curdest;
  3231. int result;
  3232. /* Validate input parameters */
  3233. if ((Src1 == NULL) || (Dest == NULL))
  3234. return(-1);
  3235. if (length == 0)
  3236. return(0);
  3237. if (N > 8) {
  3238. return (-1);
  3239. }
  3240. /* Special case: N==0 */
  3241. if (N == 0) {
  3242. memcpy(Src1, Dest, length);
  3243. return (0);
  3244. }
  3245. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  3246. SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);
  3247. /* Check for unaligned bytes */
  3248. if ((length & 7) > 0) {
  3249. /* Setup to process unaligned bytes */
  3250. istart = length & 0xfffffff8;
  3251. cursrc1 = &Src1[istart];
  3252. curdest = &Dest[istart];
  3253. } else {
  3254. /* No unaligned bytes - we are done */
  3255. return (0);
  3256. }
  3257. } else {
  3258. /* Setup to process whole image */
  3259. istart = 0;
  3260. cursrc1 = Src1;
  3261. curdest = Dest;
  3262. }
  3263. /* C routine to process image */
  3264. for (i = istart; i < length; i++) {
  3265. result = (int) *cursrc1 << N;
  3266. if (result > 255)
  3267. result = 255;
  3268. *curdest = (unsigned char) result;
  3269. /* Advance pointers */
  3270. cursrc1++;
  3271. curdest++;
  3272. }
  3273. return (0);
  3274. }
  3275. /*!
  3276. \brief MMX BinarizeUsingThreshold: D = (S >= T) ? 255:0
  3277. \param Src1 Pointer to the start of the source byte array (S).
  3278. \param Dest Pointer to the start of the destination byte array (D).
  3279. \param SrcLength The number of bytes in the source array.
  3280. \param T The threshold boundary (inclusive).
  3281. \return Returns 0 for success or -1 for error.
  3282. */
  3283. int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
  3284. {
  3285. #ifdef USE_MMX
  3286. #if !defined(GCC__)
  3287. __asm
  3288. {
  3289. pusha
  3290. /* ** Duplicate T in 8 bytes of MM3 ** */
  3291. pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
  3292. pcmpeqb mm2, mm2 /* generate all 1's in mm2 */
  3293. mov al, T /* load T into AL */
  3294. mov ah, al /* copy AL into AH */
  3295. mov bx, ax /* copy AX into BX */
  3296. shl eax, 16 /* shift 2 bytes of EAX left */
  3297. mov ax, bx /* copy BX into AX */
  3298. movd mm3, eax /* copy EAX into MM3 */
  3299. movd mm4, eax /* copy EAX into MM4 */
  3300. punpckldq mm3, mm4 /* fill higher bytes of MM3 with T */
  3301. psubusb mm2, mm3 /* store 0xFF - T in MM2 */
  3302. mov eax, Src1 /* load Src1 address into eax */
  3303. mov edi, Dest /* load Dest address into edi */
  3304. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  3305. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  3306. align 16 /* 16 byte alignment of the loop entry */
  3307. L1029:
  3308. movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
  3309. paddusb mm0, mm2 /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
  3310. pcmpeqb mm0, mm1 /* binarize 255:0, comparing to 255 */
  3311. movq [edi], mm0 /* store result in SrcDest */
  3312. add eax, 8 /* increase Src1 register pointer by 8 */
  3313. add edi, 8 /* increase Dest register pointer by 8 */
  3314. dec ecx /* decrease loop counter */
  3315. jnz L1029 /* check loop termination, proceed if required */
  3316. emms /* exit MMX state */
  3317. popa
  3318. }
  3319. #else
  3320. asm volatile
  3321. ("pusha \n\t"
  3322. /* ** Duplicate T in 8 bytes of MM3 ** */
  3323. "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
  3324. "pcmpeqb %%mm2, %%mm2 \n\t" /* generate all 1's in mm2 */
  3325. "mov %3, %%al \n\t" /* load T into AL */
  3326. "mov %%al, %%ah \n\t" /* copy AL into AH */
  3327. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  3328. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  3329. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  3330. "movd %%eax, %%mm3 \n\t" /* copy EAX into MM3 */
  3331. "movd %%eax, %%mm4 \n\t" /* copy EAX into MM4 */
  3332. "punpckldq %%mm4, %%mm3 \n\t" /* fill higher bytes of MM3 with T */
  3333. "psubusb %%mm3, %%mm2 \n\t" /* store 0xFF - T in MM2 */
  3334. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  3335. "mov %0, %%edi \n\t" /* load Dest address into edi */
  3336. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  3337. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  3338. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3339. "1: \n\t"
  3340. "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
  3341. "paddusb %%mm2, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
  3342. "pcmpeqb %%mm1, %%mm0 \n\t" /* binarize 255:0, comparing to 255 */
  3343. "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
  3344. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  3345. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  3346. "dec %%ecx \n\t" /* decrease loop counter */
  3347. "jnz 1b \n\t" /* check loop termination, proceed if required */
  3348. "emms \n\t" /* exit MMX state */
  3349. "popa \n\t":"=m" (Dest) /* %0 */
  3350. :"m"(Src1), /* %1 */
  3351. "m"(SrcLength), /* %2 */
  3352. "m"(T) /* %3 */
  3353. );
  3354. #endif
  3355. return (0);
  3356. #else
  3357. return (-1);
  3358. #endif
  3359. }
  3360. /*!
  3361. \brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0
  3362. \param Src1 Pointer to the start of the source byte array (S).
  3363. \param Dest Pointer to the start of the destination byte array (D).
  3364. \param length The number of bytes in the source array.
  3365. \param T The threshold boundary (inclusive).
  3366. \return Returns 0 for success or -1 for error.
  3367. */
  3368. int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
  3369. {
  3370. unsigned int i, istart;
  3371. unsigned char *cursrc1;
  3372. unsigned char *curdest;
  3373. /* Validate input parameters */
  3374. if ((Src1 == NULL) || (Dest == NULL))
  3375. return(-1);
  3376. if (length == 0)
  3377. return(0);
  3378. /* Special case: T==0 */
  3379. if (T == 0) {
  3380. memset(Dest, 255, length);
  3381. return (0);
  3382. }
  3383. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  3384. SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
  3385. /* Check for unaligned bytes */
  3386. if ((length & 7) > 0) {
  3387. /* Setup to process unaligned bytes */
  3388. istart = length & 0xfffffff8;
  3389. cursrc1 = &Src1[istart];
  3390. curdest = &Dest[istart];
  3391. } else {
  3392. /* No unaligned bytes - we are done */
  3393. return (0);
  3394. }
  3395. } else {
  3396. /* Setup to process whole image */
  3397. istart = 0;
  3398. cursrc1 = Src1;
  3399. curdest = Dest;
  3400. }
  3401. /* C routine to process image */
  3402. for (i = istart; i < length; i++) {
  3403. *curdest = ((unsigned char) *cursrc1 >= T) ? 255 : 0;
  3404. /* Advance pointers */
  3405. cursrc1++;
  3406. curdest++;
  3407. }
  3408. return (0);
  3409. }
  3410. /*!
  3411. \brief Internal MMX Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax
  3412. \param Src1 Pointer to the start of the source byte array (S).
  3413. \param Dest Pointer to the start of the destination byte array (D).
  3414. \param SrcLength The number of bytes in the source array.
  3415. \param Tmin Lower (inclusive) boundary of the clipping range.
  3416. \param Tmax Upper (inclusive) boundary of the clipping range.
  3417. \return Returns 0 for success or -1 for error.
  3418. */
  3419. int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
  3420. unsigned char Tmax)
  3421. {
  3422. #ifdef USE_MMX
  3423. #if !defined(GCC__)
  3424. __asm
  3425. {
  3426. pusha
  3427. pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
  3428. /* ** Duplicate Tmax in 8 bytes of MM3 ** */
  3429. mov al, Tmax /* load Tmax into AL */
  3430. mov ah, al /* copy AL into AH */
  3431. mov bx, ax /* copy AX into BX */
  3432. shl eax, 16 /* shift 2 bytes of EAX left */
  3433. mov ax, bx /* copy BX into AX */
  3434. movd mm3, eax /* copy EAX into MM3 */
  3435. movd mm4, eax /* copy EAX into MM4 */
  3436. punpckldq mm3, mm4 /* fill higher bytes of MM3 with Tmax */
  3437. psubusb mm1, mm3 /* store 0xFF - Tmax in MM1 */
  3438. /* ** Duplicate Tmin in 8 bytes of MM5 ** */
  3439. mov al, Tmin /* load Tmin into AL */
  3440. mov ah, al /* copy AL into AH */
  3441. mov bx, ax /* copy AX into BX */
  3442. shl eax, 16 /* shift 2 bytes of EAX left */
  3443. mov ax, bx /* copy BX into AX */
  3444. movd mm5, eax /* copy EAX into MM5 */
  3445. movd mm4, eax /* copy EAX into MM4 */
  3446. punpckldq mm5, mm4 /* fill higher bytes of MM5 with Tmin */
  3447. movq mm7, mm5 /* copy MM5 into MM7 */
  3448. paddusb mm7, mm1 /* store 0xFF - Tmax + Tmin in MM7 */
  3449. mov eax, Src1 /* load Src1 address into eax */
  3450. mov edi, Dest /* load Dest address into edi */
  3451. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  3452. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  3453. align 16 /* 16 byte alignment of the loop entry */
  3454. L1030:
  3455. movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
  3456. paddusb mm0, mm1 /* MM0=SrcDest+(0xFF-Tmax) */
  3457. psubusb mm0, mm7 /* MM0=MM0-(0xFF-Tmax+Tmin) */
  3458. paddusb mm0, mm5 /* MM0=MM0+Tmin */
  3459. movq [edi], mm0 /* store result in Dest */
  3460. add eax, 8 /* increase Src1 register pointer by 8 */
  3461. add edi, 8 /* increase Dest register pointer by 8 */
  3462. dec ecx /* decrease loop counter */
  3463. jnz L1030 /* check loop termination, proceed if required */
  3464. emms /* exit MMX state */
  3465. popa
  3466. }
  3467. #else
  3468. asm volatile
  3469. ("pusha \n\t" "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
  3470. /* ** Duplicate Tmax in 8 bytes of MM3 ** */
  3471. "mov %4, %%al \n\t" /* load Tmax into AL */
  3472. "mov %%al, %%ah \n\t" /* copy AL into AH */
  3473. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  3474. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  3475. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  3476. "movd %%eax, %%mm3 \n\t" /* copy EAX into MM3 */
  3477. "movd %%eax, %%mm4 \n\t" /* copy EAX into MM4 */
  3478. "punpckldq %%mm4, %%mm3 \n\t" /* fill higher bytes of MM3 with Tmax */
  3479. "psubusb %%mm3, %%mm1 \n\t" /* store 0xFF - Tmax in MM1 */
  3480. /* ** Duplicate Tmin in 8 bytes of MM5 ** */
  3481. "mov %3, %%al \n\t" /* load Tmin into AL */
  3482. "mov %%al, %%ah \n\t" /* copy AL into AH */
  3483. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  3484. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  3485. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  3486. "movd %%eax, %%mm5 \n\t" /* copy EAX into MM5 */
  3487. "movd %%eax, %%mm4 \n\t" /* copy EAX into MM4 */
  3488. "punpckldq %%mm4, %%mm5 \n\t" /* fill higher bytes of MM5 with Tmin */
  3489. "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
  3490. "paddusb %%mm1, %%mm7 \n\t" /* store 0xFF - Tmax + Tmin in MM7 */
  3491. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  3492. "mov %0, %%edi \n\t" /* load Dest address into edi */
  3493. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  3494. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  3495. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3496. "1: \n\t"
  3497. "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
  3498. "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-Tmax) */
  3499. "psubusb %%mm7, %%mm0 \n\t" /* MM0=MM0-(0xFF-Tmax+Tmin) */
  3500. "paddusb %%mm5, %%mm0 \n\t" /* MM0=MM0+Tmin */
  3501. "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
  3502. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  3503. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  3504. "dec %%ecx \n\t" /* decrease loop counter */
  3505. "jnz 1b \n\t" /* check loop termination, proceed if required */
  3506. "emms \n\t" /* exit MMX state */
  3507. "popa \n\t":"=m" (Dest) /* %0 */
  3508. :"m"(Src1), /* %1 */
  3509. "m"(SrcLength), /* %2 */
  3510. "m"(Tmin), /* %3 */
  3511. "m"(Tmax) /* %4 */
  3512. );
  3513. #endif
  3514. return (0);
  3515. #else
  3516. return (-1);
  3517. #endif
  3518. }
  3519. /*!
  3520. \brief Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax
  3521. \param Src1 Pointer to the start of the source byte array (S).
  3522. \param Dest Pointer to the start of the destination byte array (D).
  3523. \param length The number of bytes in the source array.
  3524. \param Tmin Lower (inclusive) boundary of the clipping range.
  3525. \param Tmax Upper (inclusive) boundary of the clipping range.
  3526. \return Returns 0 for success or -1 for error.
  3527. */
  3528. int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
  3529. unsigned char Tmax)
  3530. {
  3531. unsigned int i, istart;
  3532. unsigned char *cursrc1;
  3533. unsigned char *curdest;
  3534. /* Validate input parameters */
  3535. if ((Src1 == NULL) || (Dest == NULL))
  3536. return(-1);
  3537. if (length == 0)
  3538. return(0);
  3539. /* Special case: Tmin==0 && Tmax = 255 */
  3540. if ((Tmin == 0) && (Tmax == 25)) {
  3541. memcpy(Src1, Dest, length);
  3542. return (0);
  3543. }
  3544. if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
  3545. SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);
  3546. /* Check for unaligned bytes */
  3547. if ((length & 7) > 0) {
  3548. /* Setup to process unaligned bytes */
  3549. istart = length & 0xfffffff8;
  3550. cursrc1 = &Src1[istart];
  3551. curdest = &Dest[istart];
  3552. } else {
  3553. /* No unaligned bytes - we are done */
  3554. return (0);
  3555. }
  3556. } else {
  3557. /* Setup to process whole image */
  3558. istart = 0;
  3559. cursrc1 = Src1;
  3560. curdest = Dest;
  3561. }
  3562. /* C routine to process image */
  3563. for (i = istart; i < length; i++) {
  3564. if (*cursrc1 < Tmin) {
  3565. *curdest = Tmin;
  3566. } else if (*cursrc1 > Tmax) {
  3567. *curdest = Tmax;
  3568. } else {
  3569. *curdest = *cursrc1;
  3570. }
  3571. /* Advance pointers */
  3572. cursrc1++;
  3573. curdest++;
  3574. }
  3575. return (0);
  3576. }
  3577. /*!
  3578. \brief Internal MMX Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
  3579. \param Src1 Pointer to the start of the source byte array (S).
  3580. \param Dest Pointer to the start of the destination byte array (D).
  3581. \param SrcLength The number of bytes in the source array.
  3582. \param Cmin Normalization constant (Cmin).
  3583. \param Cmax Normalization constant (Cmax).
  3584. \param Nmin Normalization constant (Nmin).
  3585. \param Nmax Normalization constant (Nmax).
  3586. \return Returns 0 for success or -1 for error.
  3587. */
  3588. int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
  3589. int Nmin, int Nmax)
  3590. {
  3591. #ifdef USE_MMX
  3592. #if !defined(GCC__)
  3593. __asm
  3594. {
  3595. pusha
  3596. mov ax, WORD PTR Nmax /* load Nmax in AX */
  3597. mov bx, WORD PTR Cmax /* load Cmax in BX */
  3598. sub ax, WORD PTR Nmin /* AX = Nmax - Nmin */
  3599. sub bx, WORD PTR Cmin /* BX = Cmax - Cmin */
  3600. jz L10311 /* check division by zero */
  3601. xor dx, dx /* prepare for division, zero DX */
  3602. div bx /* AX = AX/BX */
  3603. jmp L10312
  3604. L10311:
  3605. mov ax, 255 /* if div by zero, assume result max byte value */
  3606. L10312: /* ** Duplicate AX in 4 words of MM0 ** */
  3607. mov bx, ax /* copy AX into BX */
  3608. shl eax, 16 /* shift 2 bytes of EAX left */
  3609. mov ax, bx /* copy BX into AX */
  3610. movd mm0, eax /* copy EAX into MM0 */
  3611. movd mm1, eax /* copy EAX into MM1 */
  3612. punpckldq mm0, mm1 /* fill higher words of MM0 with AX */
  3613. /* ** Duplicate Cmin in 4 words of MM1 ** */
  3614. mov ax, WORD PTR Cmin /* load Cmin into AX */
  3615. mov bx, ax /* copy AX into BX */
  3616. shl eax, 16 /* shift 2 bytes of EAX left */
  3617. mov ax, bx /* copy BX into AX */
  3618. movd mm1, eax /* copy EAX into MM1 */
  3619. movd mm2, eax /* copy EAX into MM2 */
  3620. punpckldq mm1, mm2 /* fill higher words of MM1 with Cmin */
  3621. /* ** Duplicate Nmin in 4 words of MM2 ** */
  3622. mov ax, WORD PTR Nmin /* load Nmin into AX */
  3623. mov bx, ax /* copy AX into BX */
  3624. shl eax, 16 /* shift 2 bytes of EAX left */
  3625. mov ax, bx /* copy BX into AX */
  3626. movd mm2, eax /* copy EAX into MM2 */
  3627. movd mm3, eax /* copy EAX into MM3 */
  3628. punpckldq mm2, mm3 /* fill higher words of MM2 with Nmin */
  3629. pxor mm7, mm7 /* zero MM7 register */
  3630. mov eax, Src1 /* load Src1 address into eax */
  3631. mov edi, Dest /* load Dest address into edi */
  3632. mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
  3633. shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
  3634. align 16 /* 16 byte alignment of the loop entry */
  3635. L1031:
  3636. movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
  3637. movq mm4, mm3 /* copy MM3 into MM4 */
  3638. punpcklbw mm3, mm7 /* unpack low bytes of SrcDest into words */
  3639. punpckhbw mm4, mm7 /* unpack high bytes of SrcDest into words */
  3640. psubusb mm3, mm1 /* S-Cmin, low bytes */
  3641. psubusb mm4, mm1 /* S-Cmin, high bytes */
  3642. pmullw mm3, mm0 /* MM0*(S-Cmin), low bytes */
  3643. pmullw mm4, mm0 /* MM0*(S-Cmin), high bytes */
  3644. paddusb mm3, mm2 /* MM0*(S-Cmin)+Nmin, low bytes */
  3645. paddusb mm4, mm2 /* MM0*(S-Cmin)+Nmin, high bytes */
  3646. /* ** Take abs value of the signed words ** */
  3647. movq mm5, mm3 /* copy mm3 into mm5 */
  3648. movq mm6, mm4 /* copy mm4 into mm6 */
  3649. psraw mm5, 15 /* fill mm5 words with word sign bit */
  3650. psraw mm6, 15 /* fill mm6 words with word sign bit */
  3651. pxor mm3, mm5 /* take 1's compliment of only neg words */
  3652. pxor mm4, mm6 /* take 1's compliment of only neg words */
  3653. psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
  3654. psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
  3655. packuswb mm3, mm4 /* pack words back into bytes with saturation */
  3656. movq [edi], mm3 /* store result in Dest */
  3657. add eax, 8 /* increase Src1 register pointer by 8 */
  3658. add edi, 8 /* increase Dest register pointer by 8 */
  3659. dec ecx /* decrease loop counter */
  3660. jnz L1031 /* check loop termination, proceed if required */
  3661. emms /* exit MMX state */
  3662. popa
  3663. }
  3664. #else
  3665. asm volatile
  3666. ("pusha \n\t" "mov %6, %%ax \n\t" /* load Nmax in AX */
  3667. "mov %4, %%bx \n\t" /* load Cmax in BX */
  3668. "sub %5, %%ax \n\t" /* AX = Nmax - Nmin */
  3669. "sub %3, %%bx \n\t" /* BX = Cmax - Cmin */
  3670. "jz 1f \n\t" /* check division by zero */
  3671. "xor %%dx, %%dx \n\t" /* prepare for division, zero DX */
  3672. "div %%bx \n\t" /* AX = AX/BX */
  3673. "jmp 2f \n\t" "1: \n\t" "mov $255, %%ax \n\t" /* if div by zero, assume result max. byte value */
  3674. "2: \n\t" /* ** Duplicate AX in 4 words of MM0 ** */
  3675. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  3676. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  3677. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  3678. "movd %%eax, %%mm0 \n\t" /* copy EAX into MM0 */
  3679. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  3680. "punpckldq %%mm1, %%mm0 \n\t" /* fill higher words of MM0 with AX */
  3681. /* ** Duplicate Cmin in 4 words of MM1 ** */
  3682. "mov %3, %%ax \n\t" /* load Cmin into AX */
  3683. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  3684. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  3685. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  3686. "movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */
  3687. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  3688. "punpckldq %%mm2, %%mm1 \n\t" /* fill higher words of MM1 with Cmin */
  3689. /* ** Duplicate Nmin in 4 words of MM2 ** */
  3690. "mov %5, %%ax \n\t" /* load Nmin into AX */
  3691. "mov %%ax, %%bx \n\t" /* copy AX into BX */
  3692. "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
  3693. "mov %%bx, %%ax \n\t" /* copy BX into AX */
  3694. "movd %%eax, %%mm2 \n\t" /* copy EAX into MM2 */
  3695. "movd %%eax, %%mm3 \n\t" /* copy EAX into MM3 */
  3696. "punpckldq %%mm3, %%mm2 \n\t" /* fill higher words of MM2 with Nmin */
  3697. "pxor %%mm7, %%mm7 \n\t" /* zero MM7 register */
  3698. "mov %1, %%eax \n\t" /* load Src1 address into eax */
  3699. "mov %0, %%edi \n\t" /* load Dest address into edi */
  3700. "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
  3701. "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
  3702. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3703. "1: \n\t"
  3704. "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
  3705. "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
  3706. "punpcklbw %%mm7, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
  3707. "punpckhbw %%mm7, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
  3708. "psubusb %%mm1, %%mm3 \n\t" /* S-Cmin, low bytes */
  3709. "psubusb %%mm1, %%mm4 \n\t" /* S-Cmin, high bytes */
  3710. "pmullw %%mm0, %%mm3 \n\t" /* MM0*(S-Cmin), low bytes */
  3711. "pmullw %%mm0, %%mm4 \n\t" /* MM0*(S-Cmin), high bytes */
  3712. "paddusb %%mm2, %%mm3 \n\t" /* MM0*(S-Cmin)+Nmin, low bytes */
  3713. "paddusb %%mm2, %%mm4 \n\t" /* MM0*(S-Cmin)+Nmin, high bytes */
  3714. /* ** Take abs value of the signed words ** */
  3715. "movq %%mm3, %%mm5 \n\t" /* copy mm3 into mm5 */
  3716. "movq %%mm4, %%mm6 \n\t" /* copy mm4 into mm6 */
  3717. "psraw $15, %%mm5 \n\t" /* fill mm5 words with word sign bit */
  3718. "psraw $15, %%mm6 \n\t" /* fill mm6 words with word sign bit */
  3719. "pxor %%mm5, %%mm3 \n\t" /* take 1's compliment of only neg. words */
  3720. "pxor %%mm6, %%mm4 \n\t" /* take 1's compliment of only neg. words */
  3721. "psubsw %%mm5, %%mm3 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  3722. "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  3723. "packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */
  3724. "movq %%mm3, (%%edi) \n\t" /* store result in Dest */
  3725. "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
  3726. "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
  3727. "dec %%ecx \n\t" /* decrease loop counter */
  3728. "jnz 1b \n\t" /* check loop termination, proceed if required */
  3729. "emms \n\t" /* exit MMX state */
  3730. "popa \n\t":"=m" (Dest) /* %0 */
  3731. :"m"(Src1), /* %1 */
  3732. "m"(SrcLength), /* %2 */
  3733. "m"(Cmin), /* %3 */
  3734. "m"(Cmax), /* %4 */
  3735. "m"(Nmin), /* %5 */
  3736. "m"(Nmax) /* %6 */
  3737. );
  3738. #endif
  3739. return (0);
  3740. #else
  3741. return (-1);
  3742. #endif
  3743. }
  /*!
  \brief Filter using NormalizeLinear: D = saturation0and255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)

  The scale factor (Nmax - Nmin)/(Cmax - Cmin) is evaluated with integer division.
  When MMX is detected and length > 7, SDL_imageFilterNormalizeLinearMMX() is called
  first and the C loop below only processes the trailing (length & 7) bytes; otherwise
  the C loop processes the whole array.
  In the C loop the result is clamped to the range 0..255 before it is stored, so a
  negative intermediate value no longer wraps around when converted to a byte.
  If Cmax == Cmin the C loop is skipped and the bytes it would have written are left
  unchanged in Dest.

  \param Src Pointer to the start of the source byte array (S).
  \param Dest Pointer to the start of the destination byte array (D).
  \param length The number of bytes in the source array.
  \param Cmin Normalization constant.
  \param Cmax Normalization constant.
  \param Nmin Normalization constant.
  \param Nmax Normalization constant.

  \return Returns 0 for success or -1 for error.
  */
  int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
                                     int Nmax)
  {
      unsigned int i, istart;
      int dC, factor, result;

      /* Validate input parameters */
      if ((Src == NULL) || (Dest == NULL))
          return (-1);
      if (length == 0)
          return (0);

      if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
          /* The MMX routine works on whole groups of 8 bytes */
          SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);

          /* No unaligned bytes - we are done */
          if ((length & 7) == 0)
              return (0);

          /* Continue with the 1..7 trailing bytes in C */
          istart = length & 0xfffffff8;
      } else {
          /* Process the whole array in C */
          istart = 0;
      }

      /* C routine to process image */
      dC = Cmax - Cmin;
      if (dC == 0)
          return (0);             /* avoid division by zero; nothing is written */
      factor = (Nmax - Nmin) / dC;

      for (i = istart; i < length; i++) {
          result = factor * ((int) Src[i] - Cmin) + Nmin;
          /* Saturate on both ends; the cast alone would wrap negative values */
          if (result > 255)
              result = 255;
          else if (result < 0)
              result = 0;
          Dest[i] = (unsigned char) result;
      }
      return (0);
  }
  3803. /* ------------------------------------------------------------------------------------ */
  3804. /*!
  3805. \brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )
  3806. \param Src The source 2D byte array to convolve. Should be different from destination. NOTE(review): every pixel step loads 8 bytes from each of three source rows, so the last loads appear to reach a few bytes past rows*columns - confirm that Src is padded accordingly.
  3807. \param Dest The destination 2D byte array to store the result in. Should be different from source. Only the interior (rows-2) x (columns-2) pixels are written; the one-pixel border is not touched.
  3808. \param rows Number of rows in source/destination array. Must be >2.
  3809. \param columns Number of columns in source/destination array. Must be >2.
  3810. \param Kernel The 2D convolution kernel of size 3x3. It is read as three rows of four 16-bit words (8 bytes per row, 24 bytes in total); the fourth word of each row is multiplied with a fourth source byte and summed, so it should be 0.
  3811. \param Divisor The divisor of the convolution sum. Must be >0.
  3812. Note: Non-MMX implementation not available for this function.
  3813. \return Returns 0 if the filter was applied, -1 if a parameter is invalid or SDL_imageFilterMMXdetect() reports no MMX support.
  3814. */
  3815. int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  3816. signed short *Kernel, unsigned char Divisor)
  3817. {
  3818. /* Validate input parameters */
  3819. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  3820. return(-1);
  3821. if ((columns < 3) || (rows < 3) || (Divisor == 0))
  3822. return (-1);
  3823. if ((SDL_imageFilterMMXdetect())) {
  3824. #ifdef USE_MMX
  3825. #if !defined(GCC__)
  3826. __asm
  3827. {
  3828. pusha
  3829. pxor mm0, mm0 /* zero MM0 */
  3830. xor ebx, ebx /* zero EBX */
  3831. mov bl, Divisor /* load Divisor into BL */
  3832. mov edx, Kernel /* load Kernel address into EDX */
  3833. movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
  3834. add edx, 8 /* second row |K0 K1 K2 0| */
  3835. movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
  3836. add edx, 8 /* third row |K6 K7 K8 0| */
  3837. movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
  3838. /* ---, */
  3839. mov eax, columns /* load columns into EAX */
  3840. mov esi, Src /* ESI = Src row 0 address */
  3841. mov edi, Dest /* load Dest address to EDI */
  3842. add edi, eax /* EDI = EDI + columns */
  3843. inc edi /* 1 byte offset from the left edge */
  3844. mov edx, rows /* initialize ROWS counter */
  3845. sub edx, 2 /* do not use first and last row */
  3846. /* ---, */
  3847. L10320:
  3848. mov ecx, eax /* initialize COLUMNS counter */
  3849. sub ecx, 2 /* do not use first and last column */
  3850. align 16 /* 16 byte alignment of the loop entry */
  3851. L10322:
  3852. /* ---, */
  3853. movq mm1, [esi] /* load 8 bytes of the image first row */
  3854. add esi, eax /* move one row below */
  3855. movq mm2, [esi] /* load 8 bytes of the image second row */
  3856. add esi, eax /* move one row below */
  3857. movq mm3, [esi] /* load 8 bytes of the image third row */
  3858. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  3859. punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
  3860. punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
  3861. pmullw mm1, mm5 /* multiply words first row image*Kernel */
  3862. pmullw mm2, mm6 /* multiply words second row image*Kernel */
  3863. pmullw mm3, mm7 /* multiply words third row image*Kernel */
  3864. paddsw mm1, mm2 /* add 4 words of the first and second rows */
  3865. paddsw mm1, mm3 /* add 4 words of the third row and result */
  3866. movq mm2, mm1 /* copy MM1 into MM2 */
  3867. psrlq mm1, 32 /* shift 2 left words to the right */
  3868. paddsw mm1, mm2 /* add 2 left and 2 right result words */
  3869. movq mm3, mm1 /* copy MM1 into MM3 */
  3870. psrlq mm1, 16 /* shift 1 left word to the right */
  3871. paddsw mm1, mm3 /* add 1 left and 1 right result words */
  3872. /* --, */
  3873. movd mm2, eax /* save EAX in MM2 */
  3874. movd mm3, edx /* save EDX in MM3 */
  3875. movd eax, mm1 /* copy MM1 into EAX */
  3876. psraw mm1, 15 /* spread sign bit of the result */
  3877. movd edx, mm1 /* fill EDX with a sign bit */
  3878. idiv bx /* IDIV - VERY EXPENSIVE (signed DX:AX / BX, quotient in AX) */
  3879. movd mm1, eax /* move result of division into MM1 */
  3880. packuswb mm1, mm0 /* pack division result with saturation */
  3881. movd eax, mm1 /* copy saturated result into EAX */
  3882. mov [edi], al /* copy a byte result into Dest */
  3883. movd edx, mm3 /* restore saved EDX */
  3884. movd eax, mm2 /* restore saved EAX */
  3885. /* --, */
  3886. sub esi, eax /* move two rows up */
  3887. sub esi, eax /* (second of the two rows) */
  3888. inc esi /* move Src pointer to the next pixel */
  3889. inc edi /* move Dest pointer to the next pixel */
  3890. /* ---, */
  3891. dec ecx /* decrease loop counter COLUMNS */
  3892. jnz L10322 /* check loop termination, proceed if required */
  3893. add esi, 2 /* move to the next row in Src */
  3894. add edi, 2 /* move to the next row in Dest */
  3895. dec edx /* decrease loop counter ROWS */
  3896. jnz L10320 /* check loop termination, proceed if required */
  3897. /* ---, */
  3898. emms /* exit MMX state */
  3899. popa
  3900. }
  3901. #else
  3902. asm volatile
  3903. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  3904. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  3905. "mov %5, %%bl \n\t" /* load Divisor into BL */
  3906. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  3907. "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
  3908. "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
  3909. "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
  3910. "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
  3911. "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
  3912. /* --- */
  3913. "mov %3, %%eax \n\t" /* load columns into EAX */
  3914. "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
  3915. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  3916. "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
  3917. "inc %%edi \n\t" /* 1 byte offset from the left edge */
  3918. "mov %2, %%edx \n\t" /* initialize ROWS counter */
  3919. "sub $2, %%edx \n\t" /* do not use first and last row */
  3920. /* --- */
  3921. ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  3922. "sub $2, %%ecx \n\t" /* do not use first and last column */
  3923. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  3924. ".L10322: \n\t"
  3925. /* --- */
  3926. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
  3927. "add %%eax, %%esi \n\t" /* move one row below */
  3928. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
  3929. "add %%eax, %%esi \n\t" /* move one row below */
  3930. "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
  3931. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  3932. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
  3933. "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
  3934. "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
  3935. "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
  3936. "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
  3937. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
  3938. "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
  3939. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  3940. "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
  3941. "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
  3942. "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
  3943. "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
  3944. "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
  3945. /* -- */
  3946. "movd %%eax, %%mm2 \n\t" /* save EAX in MM2 */
  3947. "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
  3948. "movd %%mm1, %%eax \n\t" /* copy MM1 into EAX */
  3949. "psraw $15, %%mm1 \n\t" /* spread sign bit of the result */
  3950. "movd %%mm1, %%edx \n\t" /* fill EDX with a sign bit */
  3951. "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE (signed DX:AX / BX, quotient in AX) */
  3952. "movd %%eax, %%mm1 \n\t" /* move result of division into MM1 */
  3953. "packuswb %%mm0, %%mm1 \n\t" /* pack division result with saturation */
  3954. "movd %%mm1, %%eax \n\t" /* copy saturated result into EAX */
  3955. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  3956. "movd %%mm3, %%edx \n\t" /* restore saved EDX */
  3957. "movd %%mm2, %%eax \n\t" /* restore saved EAX */
  3958. /* -- */
  3959. "sub %%eax, %%esi \n\t" /* move two rows up */
  3960. "sub %%eax, %%esi \n\t" /* (second of the two rows) */
  3961. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  3962. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  3963. /* --- */
  3964. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  3965. "jnz .L10322 \n\t" /* check loop termination, proceed if required */
  3966. "add $2, %%esi \n\t" /* move to the next row in Src */
  3967. "add $2, %%edi \n\t" /* move to the next row in Dest */
  3968. "dec %%edx \n\t" /* decrease loop counter ROWS */
  3969. "jnz .L10320 \n\t" /* check loop termination, proceed if required */
  3970. /* --- */
  3971. "emms \n\t" /* exit MMX state */
  3972. "popa \n\t":"=m" (Dest) /* %0 */
  3973. :"m"(Src), /* %1 */
  3974. "m"(rows), /* %2 */
  3975. "m"(columns), /* %3 */
  3976. "m"(Kernel), /* %4 */
  3977. "m"(Divisor) /* %5 */
  3978. );
  3979. #endif
  3980. #endif
  3981. return (0);
  3982. } else {
  3983. /* No non-MMX implementation yet */
  3984. return (-1);
  3985. }
  3986. }
  3987. /*!
  3988. \brief Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... )
  3989. \param Src The source 2D byte array to convolve. Should be different from destination. NOTE(review): every pixel step loads 8 bytes from each of five source rows, so the last loads appear to reach a few bytes past rows*columns - confirm that Src is padded accordingly.
  3990. \param Dest The destination 2D byte array to store the result in. Should be different from source. Only the interior (rows-4) x (columns-4) pixels are written; the two-pixel border is not touched.
  3991. \param rows Number of rows in source/destination array. Must be >4.
  3992. \param columns Number of columns in source/destination array. Must be >4.
  3993. \param Kernel The 2D convolution kernel of size 5x5. It is read as five rows of eight 16-bit words (16 bytes per row, 80 bytes in total); words 6 to 8 of each row are multiplied with further source bytes and summed, so they should be 0.
  3994. \param Divisor The divisor of the convolution sum. Must be >0.
  3995. Note: Non-MMX implementation not available for this function.
  3996. \return Returns 0 if the filter was applied, -1 if a parameter is invalid or SDL_imageFilterMMXdetect() reports no MMX support.
  3997. */
  3998. int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  3999. signed short *Kernel, unsigned char Divisor)
  4000. {
  4001. /* Validate input parameters */
  4002. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  4003. return(-1);
  4004. if ((columns < 5) || (rows < 5) || (Divisor == 0))
  4005. return (-1);
  4006. if ((SDL_imageFilterMMXdetect())) {
  4007. #ifdef USE_MMX
  4008. #if !defined(GCC__)
  4009. __asm
  4010. {
  4011. pusha
  4012. pxor mm0, mm0 /* zero MM0 */
  4013. xor ebx, ebx /* zero EBX */
  4014. mov bl, Divisor /* load Divisor into BL */
  4015. movd mm5, ebx /* copy Divisor into MM5 */
  4016. mov edx, Kernel /* load Kernel address into EDX */
  4017. mov esi, Src /* load Src address to ESI */
  4018. mov edi, Dest /* load Dest address to EDI */
  4019. add edi, 2 /* 2 column offset from the left edge */
  4020. mov eax, columns /* load columns into EAX */
  4021. shl eax, 1 /* EAX = columns * 2 */
  4022. add edi, eax /* 2 row offset from the top edge */
  4023. shr eax, 1 /* EAX = columns */
  4024. mov ebx, rows /* initialize ROWS counter */
  4025. sub ebx, 4 /* do not use first 2 and last 2 rows */
  4026. /* ---, */
  4027. L10330:
  4028. mov ecx, eax /* initialize COLUMNS counter */
  4029. sub ecx, 4 /* do not use first 2 and last 2 columns */
  4030. align 16 /* 16 byte alignment of the loop entry */
  4031. L10332:
  4032. pxor mm7, mm7 /* zero MM7 (accumulator) */
  4033. movd mm6, esi /* save ESI in MM6 */
  4034. /* --- 1 */
  4035. movq mm1, [esi] /* load 8 bytes of the Src */
  4036. movq mm2, mm1 /* copy MM1 into MM2 */
  4037. add esi, eax /* move Src pointer 1 row below */
  4038. movq mm3, [edx] /* load 4 words of Kernel */
  4039. add edx, 8 /* move pointer to other 4 words */
  4040. movq mm4, [edx] /* load 4 words of Kernel */
  4041. add edx, 8 /* move pointer to other 4 words */
  4042. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4043. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4044. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4045. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4046. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4047. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4048. /* --- 2 */
  4049. movq mm1, [esi] /* load 8 bytes of the Src */
  4050. movq mm2, mm1 /* copy MM1 into MM2 */
  4051. add esi, eax /* move Src pointer 1 row below */
  4052. movq mm3, [edx] /* load 4 words of Kernel */
  4053. add edx, 8 /* move pointer to other 4 words */
  4054. movq mm4, [edx] /* load 4 words of Kernel */
  4055. add edx, 8 /* move pointer to other 4 words */
  4056. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4057. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4058. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4059. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4060. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4061. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4062. /* --- 3 */
  4063. movq mm1, [esi] /* load 8 bytes of the Src */
  4064. movq mm2, mm1 /* copy MM1 into MM2 */
  4065. add esi, eax /* move Src pointer 1 row below */
  4066. movq mm3, [edx] /* load 4 words of Kernel */
  4067. add edx, 8 /* move pointer to other 4 words */
  4068. movq mm4, [edx] /* load 4 words of Kernel */
  4069. add edx, 8 /* move pointer to other 4 words */
  4070. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4071. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4072. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4073. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4074. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4075. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4076. /* --- 4 */
  4077. movq mm1, [esi] /* load 8 bytes of the Src */
  4078. movq mm2, mm1 /* copy MM1 into MM2 */
  4079. add esi, eax /* move Src pointer 1 row below */
  4080. movq mm3, [edx] /* load 4 words of Kernel */
  4081. add edx, 8 /* move pointer to other 4 words */
  4082. movq mm4, [edx] /* load 4 words of Kernel */
  4083. add edx, 8 /* move pointer to other 4 words */
  4084. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4085. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4086. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4087. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4088. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4089. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4090. /* --- 5 */
  4091. movq mm1, [esi] /* load 8 bytes of the Src */
  4092. movq mm2, mm1 /* copy MM1 into MM2 */
  4093. movq mm3, [edx] /* load 4 words of Kernel */
  4094. add edx, 8 /* move pointer to other 4 words */
  4095. movq mm4, [edx] /* load 4 words of Kernel */
  4096. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4097. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4098. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4099. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4100. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4101. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4102. /* ---, */
  4103. movq mm3, mm7 /* copy MM7 into MM3 */
  4104. psrlq mm7, 32 /* shift 2 left words to the right */
  4105. paddsw mm7, mm3 /* add 2 left and 2 right result words */
  4106. movq mm2, mm7 /* copy MM7 into MM2 */
  4107. psrlq mm7, 16 /* shift 1 left word to the right */
  4108. paddsw mm7, mm2 /* add 1 left and 1 right result words */
  4109. /* ---, */
  4110. movd mm1, eax /* save EAX in MM1 */
  4111. movd mm2, ebx /* save EBX in MM2 */
  4112. movd mm3, edx /* save EDX in MM3 */
  4113. movd eax, mm7 /* load summation result into EAX */
  4114. psraw mm7, 15 /* spread sign bit of the result */
  4115. movd ebx, mm5 /* load Divisor into EBX */
  4116. movd edx, mm7 /* fill EDX with a sign bit */
  4117. idiv bx /* IDIV - VERY EXPENSIVE (signed DX:AX / BX, quotient in AX) */
  4118. movd mm7, eax /* move result of division into MM7 */
  4119. packuswb mm7, mm0 /* pack division result with saturation */
  4120. movd eax, mm7 /* copy saturated result into EAX */
  4121. mov [edi], al /* copy a byte result into Dest */
  4122. movd edx, mm3 /* restore saved EDX */
  4123. movd ebx, mm2 /* restore saved EBX */
  4124. movd eax, mm1 /* restore saved EAX */
  4125. /* --, */
  4126. movd esi, mm6 /* move Src pointer to the top pixel */
  4127. sub edx, 72 /* EDX = Kernel address (undo the nine 8-byte advances) */
  4128. inc esi /* move Src pointer to the next pixel */
  4129. inc edi /* move Dest pointer to the next pixel */
  4130. /* ---, */
  4131. dec ecx /* decrease loop counter COLUMNS */
  4132. jnz L10332 /* check loop termination, proceed if required */
  4133. add esi, 4 /* move to the next row in Src */
  4134. add edi, 4 /* move to the next row in Dest */
  4135. dec ebx /* decrease loop counter ROWS */
  4136. jnz L10330 /* check loop termination, proceed if required */
  4137. /* ---, */
  4138. emms /* exit MMX state */
  4139. popa
  4140. }
  4141. #else
  4142. asm volatile
  4143. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  4144. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  4145. "mov %5, %%bl \n\t" /* load Divisor into BL */
  4146. "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
  4147. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  4148. "mov %1, %%esi \n\t" /* load Src address to ESI */
  4149. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  4150. "add $2, %%edi \n\t" /* 2 column offset from the left edge */
  4151. "mov %3, %%eax \n\t" /* load columns into EAX */
  4152. "shl $1, %%eax \n\t" /* EAX = columns * 2 */
  4153. "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
  4154. "shr $1, %%eax \n\t" /* EAX = columns */
  4155. "mov %2, %%ebx \n\t" /* initialize ROWS counter */
  4156. "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
  4157. /* --- */
  4158. ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  4159. "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
  4160. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  4161. ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
  4162. "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
  4163. /* --- 1 */
  4164. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4165. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4166. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4167. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4168. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4169. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4170. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4171. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4172. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4173. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4174. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4175. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4176. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4177. /* --- 2 */
  4178. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4179. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4180. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4181. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4182. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4183. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4184. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4185. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4186. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4187. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4188. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4189. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4190. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4191. /* --- 3 */
  4192. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4193. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4194. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4195. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4196. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4197. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4198. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4199. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4200. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4201. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4202. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4203. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4204. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4205. /* --- 4 */
  4206. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4207. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4208. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4209. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4210. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4211. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4212. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4213. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4214. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4215. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4216. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4217. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4218. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4219. /* --- 5 */
  4220. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4221. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4222. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4223. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4224. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4225. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4226. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4227. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4228. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4229. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4230. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4231. /* --- */
  4232. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  4233. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  4234. "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
  4235. "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
  4236. "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
  4237. "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
  4238. /* --- */
  4239. "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
  4240. "movd %%ebx, %%mm2 \n\t" /* save EBX in MM2 */
  4241. "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
  4242. "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
  4243. "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
  4244. "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
  4245. "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
  4246. "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE (signed DX:AX / BX, quotient in AX) */
  4247. "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
  4248. "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
  4249. "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
  4250. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  4251. "movd %%mm3, %%edx \n\t" /* restore saved EDX */
  4252. "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
  4253. "movd %%mm1, %%eax \n\t" /* restore saved EAX */
  4254. /* -- */
  4255. "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
  4256. "sub $72, %%edx \n\t" /* EDX = Kernel address (undo the nine 8-byte advances) */
  4257. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  4258. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  4259. /* --- */
  4260. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  4261. "jnz .L10332 \n\t" /* check loop termination, proceed if required */
  4262. "add $4, %%esi \n\t" /* move to the next row in Src */
  4263. "add $4, %%edi \n\t" /* move to the next row in Dest */
  4264. "dec %%ebx \n\t" /* decrease loop counter ROWS */
  4265. "jnz .L10330 \n\t" /* check loop termination, proceed if required */
  4266. /* --- */
  4267. "emms \n\t" /* exit MMX state */
  4268. "popa \n\t":"=m" (Dest) /* %0 */
  4269. :"m"(Src), /* %1 */
  4270. "m"(rows), /* %2 */
  4271. "m"(columns), /* %3 */
  4272. "m"(Kernel), /* %4 */
  4273. "m"(Divisor) /* %5 */
  4274. );
  4275. #endif
  4276. #endif
  4277. return (0);
  4278. } else {
  4279. /* No non-MMX implementation yet */
  4280. return (-1);
  4281. }
  4282. }
  4283. /*!
  4284. \brief Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... )
  4285. \param Src The source 2D byte array to convolve. Should be different from destination.
  4286. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  4287. \param rows Number of rows in source/destination array. Must be >6.
  4288. \param columns Number of columns in source/destination array. Must be >6.
  4289. \param Kernel The 2D convolution kernel of size 7x7.
  4290. \param Divisor The divisor of the convolution sum. Must be >0.
  4291. Note: Non-MMX implementation not available for this function.
  4292. \return Returns 1 if filter was applied, 0 otherwise.
  4293. */
  4294. int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  4295. signed short *Kernel, unsigned char Divisor)
  4296. {
  4297. /* Validate input parameters */
  4298. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  4299. return(-1);
  4300. if ((columns < 7) || (rows < 7) || (Divisor == 0))
  4301. return (-1);
  4302. if ((SDL_imageFilterMMXdetect())) {
  4303. #ifdef USE_MMX
  4304. #if !defined(GCC__)
  4305. __asm
  4306. {
  4307. pusha
  4308. pxor mm0, mm0 /* zero MM0 */
  4309. xor ebx, ebx /* zero EBX */
  4310. mov bl, Divisor /* load Divisor into BL */
  4311. movd mm5, ebx /* copy Divisor into MM5 */
  4312. mov edx, Kernel /* load Kernel address into EDX */
  4313. mov esi, Src /* load Src address to ESI */
  4314. mov edi, Dest /* load Dest address to EDI */
  4315. add edi, 3 /* 3 column offset from the left edge */
  4316. mov eax, columns /* load columns into EAX */
  4317. add edi, eax /* 3 row offset from the top edge */
  4318. add edi, eax
  4319. add edi, eax
  4320. mov ebx, rows /* initialize ROWS counter */
  4321. sub ebx, 6 /* do not use first 3 and last 3 rows */
  4322. /* ---, */
  4323. L10340:
  4324. mov ecx, eax /* initialize COLUMNS counter */
  4325. sub ecx, 6 /* do not use first 3 and last 3 columns */
  4326. align 16 /* 16 byte alignment of the loop entry */
  4327. L10342:
  4328. pxor mm7, mm7 /* zero MM7 (accumulator) */
  4329. movd mm6, esi /* save ESI in MM6 */
  4330. /* --- 1 */
  4331. movq mm1, [esi] /* load 8 bytes of the Src */
  4332. movq mm2, mm1 /* copy MM1 into MM2 */
  4333. add esi, eax /* move Src pointer 1 row below */
  4334. movq mm3, [edx] /* load 4 words of Kernel */
  4335. add edx, 8 /* move pointer to other 4 words */
  4336. movq mm4, [edx] /* load 4 words of Kernel */
  4337. add edx, 8 /* move pointer to other 4 words */
  4338. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4339. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4340. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4341. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4342. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4343. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4344. /* --- 2 */
  4345. movq mm1, [esi] /* load 8 bytes of the Src */
  4346. movq mm2, mm1 /* copy MM1 into MM2 */
  4347. add esi, eax /* move Src pointer 1 row below */
  4348. movq mm3, [edx] /* load 4 words of Kernel */
  4349. add edx, 8 /* move pointer to other 4 words */
  4350. movq mm4, [edx] /* load 4 words of Kernel */
  4351. add edx, 8 /* move pointer to other 4 words */
  4352. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4353. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4354. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4355. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4356. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4357. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4358. /* --- 3 */
  4359. movq mm1, [esi] /* load 8 bytes of the Src */
  4360. movq mm2, mm1 /* copy MM1 into MM2 */
  4361. add esi, eax /* move Src pointer 1 row below */
  4362. movq mm3, [edx] /* load 4 words of Kernel */
  4363. add edx, 8 /* move pointer to other 4 words */
  4364. movq mm4, [edx] /* load 4 words of Kernel */
  4365. add edx, 8 /* move pointer to other 4 words */
  4366. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4367. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4368. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4369. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4370. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4371. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4372. /* --- 4 */
  4373. movq mm1, [esi] /* load 8 bytes of the Src */
  4374. movq mm2, mm1 /* copy MM1 into MM2 */
  4375. add esi, eax /* move Src pointer 1 row below */
  4376. movq mm3, [edx] /* load 4 words of Kernel */
  4377. add edx, 8 /* move pointer to other 4 words */
  4378. movq mm4, [edx] /* load 4 words of Kernel */
  4379. add edx, 8 /* move pointer to other 4 words */
  4380. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4381. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4382. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4383. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4384. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4385. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4386. /* --- 5 */
  4387. movq mm1, [esi] /* load 8 bytes of the Src */
  4388. movq mm2, mm1 /* copy MM1 into MM2 */
  4389. add esi, eax /* move Src pointer 1 row below */
  4390. movq mm3, [edx] /* load 4 words of Kernel */
  4391. add edx, 8 /* move pointer to other 4 words */
  4392. movq mm4, [edx] /* load 4 words of Kernel */
  4393. add edx, 8 /* move pointer to other 4 words */
  4394. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4395. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4396. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4397. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4398. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4399. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4400. /* --- 6 */
  4401. movq mm1, [esi] /* load 8 bytes of the Src */
  4402. movq mm2, mm1 /* copy MM1 into MM2 */
  4403. add esi, eax /* move Src pointer 1 row below */
  4404. movq mm3, [edx] /* load 4 words of Kernel */
  4405. add edx, 8 /* move pointer to other 4 words */
  4406. movq mm4, [edx] /* load 4 words of Kernel */
  4407. add edx, 8 /* move pointer to other 4 words */
  4408. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4409. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4410. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4411. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4412. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4413. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4414. /* --- 7 */
  4415. movq mm1, [esi] /* load 8 bytes of the Src */
  4416. movq mm2, mm1 /* copy MM1 into MM2 */
  4417. movq mm3, [edx] /* load 4 words of Kernel */
  4418. add edx, 8 /* move pointer to other 4 words */
  4419. movq mm4, [edx] /* load 4 words of Kernel */
  4420. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4421. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4422. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  4423. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  4424. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4425. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4426. /* ---, */
  4427. movq mm3, mm7 /* copy MM7 into MM3 */
  4428. psrlq mm7, 32 /* shift 2 left words to the right */
  4429. paddsw mm7, mm3 /* add 2 left and 2 right result words */
  4430. movq mm2, mm7 /* copy MM7 into MM2 */
  4431. psrlq mm7, 16 /* shift 1 left word to the right */
  4432. paddsw mm7, mm2 /* add 1 left and 1 right result words */
  4433. /* ---, */
  4434. movd mm1, eax /* save EDX in MM1 */
  4435. movd mm2, ebx /* save EDX in MM2 */
  4436. movd mm3, edx /* save EDX in MM3 */
  4437. movd eax, mm7 /* load summation result into EAX */
  4438. psraw mm7, 15 /* spread sign bit of the result */
  4439. movd ebx, mm5 /* load Divisor into EBX */
  4440. movd edx, mm7 /* fill EDX with a sign bit */
  4441. idiv bx /* IDIV - VERY EXPENSIVE */
  4442. movd mm7, eax /* move result of division into MM7 */
  4443. packuswb mm7, mm0 /* pack division result with saturation */
  4444. movd eax, mm7 /* copy saturated result into EAX */
  4445. mov [edi], al /* copy a byte result into Dest */
  4446. movd edx, mm3 /* restore saved EDX */
  4447. movd ebx, mm2 /* restore saved EBX */
  4448. movd eax, mm1 /* restore saved EAX */
  4449. /* --, */
  4450. movd esi, mm6 /* move Src pointer to the top pixel */
  4451. sub edx, 104 /* EDX = Kernel address */
  4452. inc esi /* move Src pointer to the next pixel */
  4453. inc edi /* move Dest pointer to the next pixel */
  4454. /* ---, */
  4455. dec ecx /* decrease loop counter COLUMNS */
  4456. jnz L10342 /* check loop termination, proceed if required */
  4457. add esi, 6 /* move to the next row in Src */
  4458. add edi, 6 /* move to the next row in Dest */
  4459. dec ebx /* decrease loop counter ROWS */
  4460. jnz L10340 /* check loop termination, proceed if required */
  4461. /* ---, */
  4462. emms /* exit MMX state */
  4463. popa
  4464. }
  4465. #else
  4466. asm volatile
  4467. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  4468. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  4469. "mov %5, %%bl \n\t" /* load Divisor into BL */
  4470. "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
  4471. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  4472. "mov %1, %%esi \n\t" /* load Src address to ESI */
  4473. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  4474. "add $3, %%edi \n\t" /* 3 column offset from the left edge */
  4475. "mov %3, %%eax \n\t" /* load columns into EAX */
  4476. "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
  4477. "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
  4478. "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
  4479. /* --- */
  4480. ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  4481. "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
  4482. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  4483. ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
  4484. "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
  4485. /* --- 1 */
  4486. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4487. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4488. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4489. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4490. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4491. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4492. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4493. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4494. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4495. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4496. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4497. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4498. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4499. /* --- 2 */
  4500. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4501. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4502. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4503. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4504. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4505. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4506. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4507. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4508. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4509. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4510. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4511. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4512. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4513. /* --- 3 */
  4514. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4515. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4516. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4517. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4518. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4519. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4520. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4521. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4522. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4523. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4524. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4525. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4526. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4527. /* --- 4 */
  4528. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4529. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4530. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4531. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4532. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4533. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4534. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4535. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4536. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4537. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4538. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4539. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4540. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4541. /* --- 5 */
  4542. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4543. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4544. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4545. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4546. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4547. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4548. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4549. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4550. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4551. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4552. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4553. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4554. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4555. /* --- 6 */
  4556. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4557. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4558. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4559. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4560. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4561. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4562. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4563. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4564. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4565. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4566. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4567. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4568. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4569. /* --- 7 */
  4570. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4571. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4572. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4573. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4574. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4575. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4576. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4577. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4578. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4579. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4580. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4581. /* --- */
  4582. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  4583. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  4584. "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
  4585. "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
  4586. "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
  4587. "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
  4588. /* --- */
  4589. "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
  4590. "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
  4591. "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
  4592. "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
  4593. "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
  4594. "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
  4595. "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
  4596. "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
  4597. "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
  4598. "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
  4599. "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
  4600. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  4601. "movd %%mm3, %%edx \n\t" /* restore saved EDX */
  4602. "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
  4603. "movd %%mm1, %%eax \n\t" /* restore saved EAX */
  4604. /* -- */
  4605. "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
  4606. "sub $104, %%edx \n\t" /* EDX = Kernel address */
  4607. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  4608. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  4609. /* --- */
  4610. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  4611. "jnz .L10342 \n\t" /* check loop termination, proceed if required */
  4612. "add $6, %%esi \n\t" /* move to the next row in Src */
  4613. "add $6, %%edi \n\t" /* move to the next row in Dest */
  4614. "dec %%ebx \n\t" /* decrease loop counter ROWS */
  4615. "jnz .L10340 \n\t" /* check loop termination, proceed if required */
  4616. /* --- */
  4617. "emms \n\t" /* exit MMX state */
  4618. "popa \n\t":"=m" (Dest) /* %0 */
  4619. :"m"(Src), /* %1 */
  4620. "m"(rows), /* %2 */
  4621. "m"(columns), /* %3 */
  4622. "m"(Kernel), /* %4 */
  4623. "m"(Divisor) /* %5 */
  4624. );
  4625. #endif
  4626. #endif
  4627. return (0);
  4628. } else {
  4629. /* No non-MMX implementation yet */
  4630. return (-1);
  4631. }
  4632. }
  4633. /*!
  4634. \brief Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... )
  4635. \param Src The source 2D byte array to convolve. Should be different from destination.
  4636. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  4637. \param rows Number of rows in source/destination array. Must be >8.
  4638. \param columns Number of columns in source/destination array. Must be >8.
  4639. \param Kernel The 2D convolution kernel of size 9x9.
  4640. \param Divisor The divisor of the convolution sum. Must be >0.
  4641. Note: Non-MMX implementation not available for this function.
  4642. \return Returns 0 for success or -1 for error (invalid parameters, or no MMX implementation available).
  4643. */
  4644. int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  4645. signed short *Kernel, unsigned char Divisor)
  4646. {
  4647. /* Validate input parameters */
  4648. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  4649. return(-1);
  4650. if ((columns < 9) || (rows < 9) || (Divisor == 0))
  4651. return (-1);
  4652. if ((SDL_imageFilterMMXdetect())) {
  4653. #ifdef USE_MMX
  4654. #if !defined(GCC__)
  4655. __asm
  4656. {
  4657. pusha
  4658. pxor mm0, mm0 /* zero MM0 */
  4659. xor ebx, ebx /* zero EBX */
  4660. mov bl, Divisor /* load Divisor into BL */
  4661. movd mm5, ebx /* copy Divisor into MM5 */
  4662. mov edx, Kernel /* load Kernel address into EDX */
  4663. mov esi, Src /* load Src address to ESI */
  4664. mov edi, Dest /* load Dest address to EDI */
  4665. add edi, 4 /* 4 column offset from the left edge */
  4666. mov eax, columns /* load columns into EAX */
  4667. add edi, eax /* 4 row offset from the top edge */
  4668. add edi, eax
  4669. add edi, eax
  4670. add edi, eax
  4671. mov ebx, rows /* initialize ROWS counter */
  4672. sub ebx, 8 /* do not use first 4 and last 4 rows */
  4673. /* ---, */
  4674. L10350:
  4675. mov ecx, eax /* initialize COLUMNS counter */
  4676. sub ecx, 8 /* do not use first 4 and last 4 columns */
  4677. align 16 /* 16 byte alignment of the loop entry */
  4678. L10352:
  4679. pxor mm7, mm7 /* zero MM7 (accumulator) */
  4680. movd mm6, esi /* save ESI in MM6 */
  4681. /* --- 1 */
  4682. movq mm1, [esi] /* load 8 bytes of the Src */
  4683. movq mm2, mm1 /* copy MM1 into MM2 */
  4684. inc esi /* move pointer to the next 8 bytes of Src */
  4685. movq mm3, [edx] /* load 4 words of Kernel */
  4686. add edx, 8 /* move pointer to other 4 words */
  4687. movq mm4, [edx] /* load 4 words of Kernel */
  4688. add edx, 8 /* move pointer to other 4 words */
  4689. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4690. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4691. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4692. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4693. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4694. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4695. movq mm1, [esi] /* load 8 bytes of the Src */
  4696. dec esi
  4697. add esi, eax /* move Src pointer 1 row below */
  4698. movq mm3, [edx] /* load 4 words of Kernel */
  4699. add edx, 8 /* move pointer to other 4 words */
  4700. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4701. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4702. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4703. /* --- 2 */
  4704. movq mm1, [esi] /* load 8 bytes of the Src */
  4705. movq mm2, mm1 /* copy MM1 into MM2 */
  4706. inc esi /* move pointer to the next 8 bytes of Src */
  4707. movq mm3, [edx] /* load 4 words of Kernel */
  4708. add edx, 8 /* move pointer to other 4 words */
  4709. movq mm4, [edx] /* load 4 words of Kernel */
  4710. add edx, 8 /* move pointer to other 4 words */
  4711. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4712. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4713. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4714. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4715. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4716. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4717. movq mm1, [esi] /* load 8 bytes of the Src */
  4718. dec esi
  4719. add esi, eax /* move Src pointer 1 row below */
  4720. movq mm3, [edx] /* load 4 words of Kernel */
  4721. add edx, 8 /* move pointer to other 4 words */
  4722. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4723. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4724. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4725. /* --- 3 */
  4726. movq mm1, [esi] /* load 8 bytes of the Src */
  4727. movq mm2, mm1 /* copy MM1 into MM2 */
  4728. inc esi /* move pointer to the next 8 bytes of Src */
  4729. movq mm3, [edx] /* load 4 words of Kernel */
  4730. add edx, 8 /* move pointer to other 4 words */
  4731. movq mm4, [edx] /* load 4 words of Kernel */
  4732. add edx, 8 /* move pointer to other 4 words */
  4733. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4734. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4735. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4736. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4737. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4738. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4739. movq mm1, [esi] /* load 8 bytes of the Src */
  4740. dec esi
  4741. add esi, eax /* move Src pointer 1 row below */
  4742. movq mm3, [edx] /* load 4 words of Kernel */
  4743. add edx, 8 /* move pointer to other 4 words */
  4744. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4745. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4746. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4747. /* --- 4 */
  4748. movq mm1, [esi] /* load 8 bytes of the Src */
  4749. movq mm2, mm1 /* copy MM1 into MM2 */
  4750. inc esi /* move pointer to the next 8 bytes of Src */
  4751. movq mm3, [edx] /* load 4 words of Kernel */
  4752. add edx, 8 /* move pointer to other 4 words */
  4753. movq mm4, [edx] /* load 4 words of Kernel */
  4754. add edx, 8 /* move pointer to other 4 words */
  4755. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4756. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4757. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4758. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4759. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4760. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4761. movq mm1, [esi] /* load 8 bytes of the Src */
  4762. dec esi
  4763. add esi, eax /* move Src pointer 1 row below */
  4764. movq mm3, [edx] /* load 4 words of Kernel */
  4765. add edx, 8 /* move pointer to other 4 words */
  4766. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4767. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4768. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4769. /* --- 5 */
  4770. movq mm1, [esi] /* load 8 bytes of the Src */
  4771. movq mm2, mm1 /* copy MM1 into MM2 */
  4772. inc esi /* move pointer to the next 8 bytes of Src */
  4773. movq mm3, [edx] /* load 4 words of Kernel */
  4774. add edx, 8 /* move pointer to other 4 words */
  4775. movq mm4, [edx] /* load 4 words of Kernel */
  4776. add edx, 8 /* move pointer to other 4 words */
  4777. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4778. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4779. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4780. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4781. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4782. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4783. movq mm1, [esi] /* load 8 bytes of the Src */
  4784. dec esi
  4785. add esi, eax /* move Src pointer 1 row below */
  4786. movq mm3, [edx] /* load 4 words of Kernel */
  4787. add edx, 8 /* move pointer to other 4 words */
  4788. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4789. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4790. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4791. /* --- 6 */
  4792. movq mm1, [esi] /* load 8 bytes of the Src */
  4793. movq mm2, mm1 /* copy MM1 into MM2 */
  4794. inc esi /* move pointer to the next 8 bytes of Src */
  4795. movq mm3, [edx] /* load 4 words of Kernel */
  4796. add edx, 8 /* move pointer to other 4 words */
  4797. movq mm4, [edx] /* load 4 words of Kernel */
  4798. add edx, 8 /* move pointer to other 4 words */
  4799. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4800. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4801. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4802. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4803. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4804. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4805. movq mm1, [esi] /* load 8 bytes of the Src */
  4806. dec esi
  4807. add esi, eax /* move Src pointer 1 row below */
  4808. movq mm3, [edx] /* load 4 words of Kernel */
  4809. add edx, 8 /* move pointer to other 4 words */
  4810. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4811. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4812. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4813. /* --- 7 */
  4814. movq mm1, [esi] /* load 8 bytes of the Src */
  4815. movq mm2, mm1 /* copy MM1 into MM2 */
  4816. inc esi /* move pointer to the next 8 bytes of Src */
  4817. movq mm3, [edx] /* load 4 words of Kernel */
  4818. add edx, 8 /* move pointer to other 4 words */
  4819. movq mm4, [edx] /* load 4 words of Kernel */
  4820. add edx, 8 /* move pointer to other 4 words */
  4821. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4822. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4823. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4824. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4825. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4826. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4827. movq mm1, [esi] /* load 8 bytes of the Src */
  4828. dec esi
  4829. add esi, eax /* move Src pointer 1 row below */
  4830. movq mm3, [edx] /* load 4 words of Kernel */
  4831. add edx, 8 /* move pointer to other 4 words */
  4832. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4833. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4834. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4835. /* --- 8 */
  4836. movq mm1, [esi] /* load 8 bytes of the Src */
  4837. movq mm2, mm1 /* copy MM1 into MM2 */
  4838. inc esi /* move pointer to the next 8 bytes of Src */
  4839. movq mm3, [edx] /* load 4 words of Kernel */
  4840. add edx, 8 /* move pointer to other 4 words */
  4841. movq mm4, [edx] /* load 4 words of Kernel */
  4842. add edx, 8 /* move pointer to other 4 words */
  4843. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4844. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4845. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4846. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4847. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4848. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4849. movq mm1, [esi] /* load 8 bytes of the Src */
  4850. dec esi
  4851. add esi, eax /* move Src pointer 1 row below */
  4852. movq mm3, [edx] /* load 4 words of Kernel */
  4853. add edx, 8 /* move pointer to other 4 words */
  4854. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4855. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4856. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4857. /* --- 9 */
  4858. movq mm1, [esi] /* load 8 bytes of the Src */
  4859. movq mm2, mm1 /* copy MM1 into MM2 */
  4860. inc esi /* move pointer to the next 8 bytes of Src */
  4861. movq mm3, [edx] /* load 4 words of Kernel */
  4862. add edx, 8 /* move pointer to other 4 words */
  4863. movq mm4, [edx] /* load 4 words of Kernel */
  4864. add edx, 8 /* move pointer to other 4 words */
  4865. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4866. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  4867. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4868. pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
  4869. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  4870. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4871. movq mm1, [esi] /* load 8 bytes of the Src */
  4872. movq mm3, [edx] /* load 4 words of Kernel */
  4873. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  4874. pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
  4875. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  4876. /* ---, */
  4877. movq mm3, mm7 /* copy MM7 into MM3 */
  4878. psrlq mm7, 32 /* shift 2 left words to the right */
  4879. paddsw mm7, mm3 /* add 2 left and 2 right result words */
  4880. movq mm2, mm7 /* copy MM7 into MM2 */
  4881. psrlq mm7, 16 /* shift 1 left word to the right */
  4882. paddsw mm7, mm2 /* add 1 left and 1 right result words */
  4883. /* ---, */
  4884. movd mm1, eax /* save EDX in MM1 */
  4885. movd mm2, ebx /* save EDX in MM2 */
  4886. movd mm3, edx /* save EDX in MM3 */
  4887. movd eax, mm7 /* load summation result into EAX */
  4888. psraw mm7, 15 /* spread sign bit of the result */
  4889. movd ebx, mm5 /* load Divisor into EBX */
  4890. movd edx, mm7 /* fill EDX with a sign bit */
  4891. idiv bx /* IDIV - VERY EXPENSIVE */
  4892. movd mm7, eax /* move result of division into MM7 */
  4893. packuswb mm7, mm0 /* pack division result with saturation */
  4894. movd eax, mm7 /* copy saturated result into EAX */
  4895. mov [edi], al /* copy a byte result into Dest */
  4896. movd edx, mm3 /* restore saved EDX */
  4897. movd ebx, mm2 /* restore saved EBX */
  4898. movd eax, mm1 /* restore saved EAX */
  4899. /* --, */
  4900. movd esi, mm6 /* move Src pointer to the top pixel */
  4901. sub edx, 208 /* EDX = Kernel address */
  4902. inc esi /* move Src pointer to the next pixel */
  4903. inc edi /* move Dest pointer to the next pixel */
  4904. /* ---, */
  4905. dec ecx /* decrease loop counter COLUMNS */
  4906. jnz L10352 /* check loop termination, proceed if required */
  4907. add esi, 8 /* move to the next row in Src */
  4908. add edi, 8 /* move to the next row in Dest */
  4909. dec ebx /* decrease loop counter ROWS */
  4910. jnz L10350 /* check loop termination, proceed if required */
  4911. /* ---, */
  4912. emms /* exit MMX state */
  4913. popa
  4914. }
  4915. #else
  4916. asm volatile
  4917. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  4918. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  4919. "mov %5, %%bl \n\t" /* load Divisor into BL */
  4920. "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
  4921. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  4922. "mov %1, %%esi \n\t" /* load Src address to ESI */
  4923. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  4924. "add $4, %%edi \n\t" /* 4 column offset from the left edge */
  4925. "mov %3, %%eax \n\t" /* load columns into EAX */
  4926. "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
  4927. "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
  4928. "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
  4929. /* --- */
  4930. ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  4931. "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
  4932. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  4933. ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
  4934. "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
  4935. /* --- 1 */
  4936. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4937. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4938. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  4939. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4940. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4941. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4942. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4943. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4944. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4945. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4946. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4947. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4948. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4949. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4950. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4951. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4952. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4953. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4954. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4955. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4956. /* --- 2 */
  4957. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4958. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4959. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  4960. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4961. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4962. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4963. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4964. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4965. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4966. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4967. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4968. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4969. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4970. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4971. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4972. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4973. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4974. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4975. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4976. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4977. /* --- 3 */
  4978. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4979. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  4980. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  4981. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4982. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4983. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  4984. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4985. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4986. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  4987. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4988. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  4989. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  4990. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4991. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  4992. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  4993. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  4994. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  4995. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  4996. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  4997. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  4998. /* --- 4 */
  4999. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5000. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5001. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  5002. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5003. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5004. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5005. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5006. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5007. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5008. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5009. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5010. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5011. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5012. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5013. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5014. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5015. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5016. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5017. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5018. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5019. /* --- 5 */
  5020. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5021. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5022. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  5023. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5024. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5025. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5026. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5027. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5028. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5029. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5030. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5031. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5032. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5033. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5034. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5035. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5036. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5037. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5038. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5039. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5040. /* --- 6 */
  5041. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5042. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5043. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  5044. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5045. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5046. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5047. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5048. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5049. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5050. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5051. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5052. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5053. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5054. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5055. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5056. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5057. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5058. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5059. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5060. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5061. /* --- 7 */
  5062. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5063. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5064. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  5065. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5066. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5067. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5068. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5069. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5070. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5071. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5072. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5073. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5074. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5075. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5076. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5077. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5078. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5079. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5080. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5081. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5082. /* --- 8 */
  5083. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5084. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5085. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  5086. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5087. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5088. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5089. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5090. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5091. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5092. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5093. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5094. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5095. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5096. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5097. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5098. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5099. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5100. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5101. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5102. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5103. /* --- 9 */
  5104. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5105. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5106. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  5107. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5108. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5109. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5110. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5111. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5112. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5113. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5114. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5115. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5116. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5117. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5118. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5119. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5120. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5121. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5122. /* --- */
  5123. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  5124. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  5125. "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
  5126. "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
  5127. "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
  5128. "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
  5129. /* --- */
  5130. "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
  5131. "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
  5132. "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
  5133. "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
  5134. "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
  5135. "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
  5136. "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
  5137. "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
  5138. "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
  5139. "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
  5140. "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
  5141. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  5142. "movd %%mm3, %%edx \n\t" /* restore saved EDX */
  5143. "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
  5144. "movd %%mm1, %%eax \n\t" /* restore saved EAX */
  5145. /* -- */
  5146. "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
  5147. "sub $208, %%edx \n\t" /* EDX = Kernel address */
  5148. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  5149. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  5150. /* --- */
  5151. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  5152. "jnz .L10352 \n\t" /* check loop termination, proceed if required */
  5153. "add $8, %%esi \n\t" /* move to the next row in Src */
  5154. "add $8, %%edi \n\t" /* move to the next row in Dest */
  5155. "dec %%ebx \n\t" /* decrease loop counter ROWS */
  5156. "jnz .L10350 \n\t" /* check loop termination, proceed if required */
  5157. /* --- */
  5158. "emms \n\t" /* exit MMX state */
  5159. "popa \n\t":"=m" (Dest) /* %0 */
  5160. :"m"(Src), /* %1 */
  5161. "m"(rows), /* %2 */
  5162. "m"(columns), /* %3 */
  5163. "m"(Kernel), /* %4 */
  5164. "m"(Divisor) /* %5 */
  5165. );
  5166. #endif
  5167. #endif
  5168. return (0);
  5169. } else {
  5170. /* No non-MMX implementation yet */
  5171. return (-1);
  5172. }
  5173. }
  /*!
  \brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )

  Only the interior of the image is computed: the first and last row and the
  first and last column of Dest are never written. For every interior pixel the
  3x3 neighbourhood of Src is widened to 16-bit words, each pixel is shifted
  right by NRightShift, multiplied with its kernel word (low 16 bits of the
  product are kept), the products are added with signed 16-bit saturation and
  the sum is clamped to 0..255 before it is stored.

  \param Src The source 2D byte array to convolve. Should be different from destination.
  The MMX code loads 8 bytes at every pixel position, so the load for the last
  computed pixel reaches 5 bytes past the end of a rows*columns buffer.
  \param Dest The destination 2D byte array to store the result in. Should be different from source.
  \param rows Number of rows in source/destination array. Must be >2.
  \param columns Number of columns in source/destination array. Must be >2.
  \param Kernel The 2D convolution kernel of size 3x3. The MMX code reads every kernel
  row as four 16-bit words, rows being 8 bytes apart: |K0 K1 K2 0|, |K3 K4 K5 0|,
  |K6 K7 K8 0|. The fourth word of each row takes part in the sum, so it has to be 0.
  \param NRightShift The number of right bit shifts applied to each source pixel before
  the multiplication. Must be <=7.

  Note: Non-MMX implementation not available for this function.

  \return Returns 0 for success or -1 for error (invalid parameter, or no MMX
  implementation compiled in / detected).
  */
  int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
                                                 signed short *Kernel, unsigned char NRightShift)
  {
      /* Validate input parameters */
      if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
          return (-1);

      if ((columns < 3) || (rows < 3) || (NRightShift > 7))
          return (-1);

  #ifdef USE_MMX
      /*
       * The whole MMX branch is guarded at compile time so that a build without
       * the assembly can never report success without having touched Dest.
       */
      if ((SDL_imageFilterMMXdetect())) {
          /*
           * NOTE(review): both variants target 32-bit x86 (pusha/popa, 32-bit
           * address registers). In the GCC variant the operands are plain "m"
           * references that are read after pusha has moved ESP by 32 bytes;
           * that is only correct if the compiler does not address them relative
           * to ESP - confirm the build keeps a frame pointer.
           */
  #if !defined(GCC__)
          __asm
          {
              pusha
              pxor mm0, mm0              /* MM0 = 0, used for unpacking and packing */
              xor ebx, ebx               /* clear EBX */
              mov bl, NRightShift        /* BL = NRightShift */
              movd mm4, ebx              /* MM4 = shift count */
              mov edx, Kernel            /* EDX = address of the kernel */
              movq mm5, [edx]            /* MM5 = {0,K2,K1,K0}  (first kernel row) */
              add edx, 8                 /* advance to the second kernel row */
              movq mm6, [edx]            /* MM6 = {0,K5,K4,K3}  (second kernel row) */
              add edx, 8                 /* advance to the third kernel row */
              movq mm7, [edx]            /* MM7 = {0,K8,K7,K6}  (third kernel row) */
              /* --- */
              mov eax, columns           /* EAX = columns (row stride) */
              mov esi, Src               /* ESI = address of Src row 0 */
              mov edi, Dest              /* EDI = address of Dest */
              add edi, eax               /* skip the first Dest row */
              inc edi                    /* skip the first Dest column */
              mov edx, rows              /* EDX = ROWS counter */
              sub edx, 2                 /* first and last row are not computed */
              /* --- */
          L10360:
              mov ecx, eax               /* ECX = COLUMNS counter */
              sub ecx, 2                 /* first and last column are not computed */
              align 16                   /* 16 byte alignment of the loop entry */
          L10362:
              /* --- */
              movq mm1, [esi]            /* 8 bytes of the first image row */
              add esi, eax               /* one row down */
              movq mm2, [esi]            /* 8 bytes of the second image row */
              add esi, eax               /* one row down */
              movq mm3, [esi]            /* 8 bytes of the third image row */
              punpcklbw mm1, mm0         /* widen the low 4 bytes to words */
              punpcklbw mm2, mm0         /* widen the low 4 bytes to words */
              punpcklbw mm3, mm0         /* widen the low 4 bytes to words */
              psrlw mm1, mm4             /* shift every pixel right by NRightShift */
              psrlw mm2, mm4             /* shift every pixel right by NRightShift */
              psrlw mm3, mm4             /* shift every pixel right by NRightShift */
              pmullw mm1, mm5            /* first row: pixel * kernel word */
              pmullw mm2, mm6            /* second row: pixel * kernel word */
              pmullw mm3, mm7            /* third row: pixel * kernel word */
              paddsw mm1, mm2            /* add first and second row products */
              paddsw mm1, mm3            /* add third row products */
              movq mm2, mm1              /* keep a copy of the 4 column sums */
              psrlq mm1, 32              /* bring the 2 upper words down */
              paddsw mm1, mm2            /* add upper and lower word pairs */
              movq mm3, mm1              /* keep a copy of the 2 partial sums */
              psrlq mm1, 16              /* bring the second word down */
              paddsw mm1, mm3            /* low word now holds the total sum */
              packuswb mm1, mm0          /* clamp to 0..255 */
              movd ebx, mm1              /* EBX = clamped result */
              mov [edi], bl              /* store the result byte in Dest */
              /* --- */
              sub esi, eax               /* back up two rows ... */
              sub esi, eax
              inc esi                    /* ... and step Src to the next pixel */
              inc edi                    /* step Dest to the next pixel */
              /* --- */
              dec ecx                    /* one column done */
              jnz L10362                 /* continue with the next column */
              add esi, 2                 /* Src: skip to the start of the next row */
              add edi, 2                 /* Dest: skip the last and first border column */
              dec edx                    /* one row done */
              jnz L10360                 /* continue with the next row */
              /* --- */
              emms                       /* exit MMX state */
              popa
          }
  #else
          asm volatile
              ("pusha \n\t"
               "pxor %%mm0, %%mm0 \n\t"          /* MM0 = 0, used for unpacking and packing */
               "xor %%ebx, %%ebx \n\t"           /* clear EBX */
               "mov %5, %%bl \n\t"               /* BL = NRightShift */
               "movd %%ebx, %%mm4 \n\t"          /* MM4 = shift count */
               "mov %4, %%edx \n\t"              /* EDX = address of the kernel */
               "movq (%%edx), %%mm5 \n\t"        /* MM5 = {0,K2,K1,K0}  (first kernel row) */
               "add $8, %%edx \n\t"              /* advance to the second kernel row */
               "movq (%%edx), %%mm6 \n\t"        /* MM6 = {0,K5,K4,K3}  (second kernel row) */
               "add $8, %%edx \n\t"              /* advance to the third kernel row */
               "movq (%%edx), %%mm7 \n\t"        /* MM7 = {0,K8,K7,K6}  (third kernel row) */
               /* --- */
               "mov %3, %%eax \n\t"              /* EAX = columns (row stride) */
               "mov %1, %%esi \n\t"              /* ESI = address of Src row 0 */
               "mov %0, %%edi \n\t"              /* EDI = address of Dest */
               "add %%eax, %%edi \n\t"           /* skip the first Dest row */
               "inc %%edi \n\t"                  /* skip the first Dest column */
               "mov %2, %%edx \n\t"              /* EDX = ROWS counter */
               "sub $2, %%edx \n\t"              /* first and last row are not computed */
               /* --- */
               ".L10360: \n\t"
               "mov %%eax, %%ecx \n\t"           /* ECX = COLUMNS counter */
               "sub $2, %%ecx \n\t"              /* first and last column are not computed */
               ".align 16 \n\t"                  /* 16 byte alignment of the loop entry */
               ".L10362: \n\t"
               /* --- */
               "movq (%%esi), %%mm1 \n\t"        /* 8 bytes of the first image row */
               "add %%eax, %%esi \n\t"           /* one row down */
               "movq (%%esi), %%mm2 \n\t"        /* 8 bytes of the second image row */
               "add %%eax, %%esi \n\t"           /* one row down */
               "movq (%%esi), %%mm3 \n\t"        /* 8 bytes of the third image row */
               "punpcklbw %%mm0, %%mm1 \n\t"     /* widen the low 4 bytes to words */
               "punpcklbw %%mm0, %%mm2 \n\t"     /* widen the low 4 bytes to words */
               "punpcklbw %%mm0, %%mm3 \n\t"     /* widen the low 4 bytes to words */
               "psrlw %%mm4, %%mm1 \n\t"         /* shift every pixel right by NRightShift */
               "psrlw %%mm4, %%mm2 \n\t"         /* shift every pixel right by NRightShift */
               "psrlw %%mm4, %%mm3 \n\t"         /* shift every pixel right by NRightShift */
               "pmullw %%mm5, %%mm1 \n\t"        /* first row: pixel * kernel word */
               "pmullw %%mm6, %%mm2 \n\t"        /* second row: pixel * kernel word */
               "pmullw %%mm7, %%mm3 \n\t"        /* third row: pixel * kernel word */
               "paddsw %%mm2, %%mm1 \n\t"        /* add first and second row products */
               "paddsw %%mm3, %%mm1 \n\t"        /* add third row products */
               "movq %%mm1, %%mm2 \n\t"          /* keep a copy of the 4 column sums */
               "psrlq $32, %%mm1 \n\t"           /* bring the 2 upper words down */
               "paddsw %%mm2, %%mm1 \n\t"        /* add upper and lower word pairs */
               "movq %%mm1, %%mm3 \n\t"          /* keep a copy of the 2 partial sums */
               "psrlq $16, %%mm1 \n\t"           /* bring the second word down */
               "paddsw %%mm3, %%mm1 \n\t"        /* low word now holds the total sum */
               "packuswb %%mm0, %%mm1 \n\t"      /* clamp to 0..255 */
               "movd %%mm1, %%ebx \n\t"          /* EBX = clamped result */
               "mov %%bl, (%%edi) \n\t"          /* store the result byte in Dest */
               /* --- */
               "sub %%eax, %%esi \n\t"           /* back up two rows ... */
               "sub %%eax, %%esi \n\t"
               "inc %%esi \n\t"                  /* ... and step Src to the next pixel */
               "inc %%edi \n\t"                  /* step Dest to the next pixel */
               /* --- */
               "dec %%ecx \n\t"                  /* one column done */
               "jnz .L10362 \n\t"                /* continue with the next column */
               "add $2, %%esi \n\t"              /* Src: skip to the start of the next row */
               "add $2, %%edi \n\t"              /* Dest: skip the last and first border column */
               "dec %%edx \n\t"                  /* one row done */
               "jnz .L10360 \n\t"                /* continue with the next row */
               /* --- */
               "emms \n\t"                       /* exit MMX state */
               "popa \n\t"
               :                                 /* no outputs: all operands are only read */
               : "m"(Dest),                      /* %0 */
                 "m"(Src),                       /* %1 */
                 "m"(rows),                      /* %2 */
                 "m"(columns),                   /* %3 */
                 "m"(Kernel),                    /* %4 */
                 "m"(NRightShift)                /* %5 */
               : "memory"                        /* result bytes are stored through EDI */
              );
  #endif
          return (0);
      }
  #endif

      /* No non-MMX implementation yet */
      return (-1);
  }
  5344. /*!
  5345. \brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... )
  5346. \param Src The source 2D byte array to convolve. Should be different from destination.
  5347. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  5348. \param rows Number of rows in source/destination array. Must be >4.
  5349. \param columns Number of columns in source/destination array. Must be >4.
  5350. \param Kernel The 2D convolution kernel of size 5x5.
  5351. \param NRightShift The number of right bit shifts applied to each source pixel before it is multiplied with the kernel. Must be <=7.
  5352. Note: Non-MMX implementation not available for this function.
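  5353. \return Returns -1 if a parameter fails validation; presumably 0 on success like the other filters in this file (TODO confirm against the end of the function body).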
  5354. */
  5355. int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  5356. signed short *Kernel, unsigned char NRightShift)
  5357. {
  5358. /* Validate input parameters */
  5359. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  5360. return(-1);
  5361. if ((columns < 5) || (rows < 5) || (NRightShift > 7))
  5362. return (-1);
  5363. if ((SDL_imageFilterMMXdetect())) {
  5364. #ifdef USE_MMX
  5365. #if !defined(GCC__)
  5366. __asm
  5367. {
  5368. pusha
  5369. pxor mm0, mm0 /* zero MM0 */
  5370. xor ebx, ebx /* zero EBX */
  5371. mov bl, NRightShift /* load NRightShift into BL */
  5372. movd mm5, ebx /* copy NRightShift into MM5 */
  5373. mov edx, Kernel /* load Kernel address into EDX */
  5374. mov esi, Src /* load Src address to ESI */
  5375. mov edi, Dest /* load Dest address to EDI */
  5376. add edi, 2 /* 2 column offset from the left edge */
  5377. mov eax, columns /* load columns into EAX */
  5378. shl eax, 1 /* EAX = columns * 2 */
  5379. add edi, eax /* 2 row offset from the top edge */
  5380. shr eax, 1 /* EAX = columns */
  5381. mov ebx, rows /* initialize ROWS counter */
  5382. sub ebx, 4 /* do not use first 2 and last 2 rows */
  5383. /* ---, */
  5384. L10370:
  5385. mov ecx, eax /* initialize COLUMNS counter */
  5386. sub ecx, 4 /* do not use first 2 and last 2 columns */
  5387. align 16 /* 16 byte alignment of the loop entry */
  5388. L10372:
  5389. pxor mm7, mm7 /* zero MM7 (accumulator) */
  5390. movd mm6, esi /* save ESI in MM6 */
  5391. /* --- 1 */
  5392. movq mm1, [esi] /* load 8 bytes of the Src */
  5393. movq mm2, mm1 /* copy MM1 into MM2 */
  5394. add esi, eax /* move Src pointer 1 row below */
  5395. movq mm3, [edx] /* load 4 words of Kernel */
  5396. add edx, 8 /* move pointer to other 4 words */
  5397. movq mm4, [edx] /* load 4 words of Kernel */
  5398. add edx, 8 /* move pointer to other 4 words */
  5399. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5400. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5401. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5402. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5403. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5404. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5405. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5406. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5407. /* --- 2 */
  5408. movq mm1, [esi] /* load 8 bytes of the Src */
  5409. movq mm2, mm1 /* copy MM1 into MM2 */
  5410. add esi, eax /* move Src pointer 1 row below */
  5411. movq mm3, [edx] /* load 4 words of Kernel */
  5412. add edx, 8 /* move pointer to other 4 words */
  5413. movq mm4, [edx] /* load 4 words of Kernel */
  5414. add edx, 8 /* move pointer to other 4 words */
  5415. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5416. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5417. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5418. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5419. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5420. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5421. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5422. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5423. /* --- 3 */
  5424. movq mm1, [esi] /* load 8 bytes of the Src */
  5425. movq mm2, mm1 /* copy MM1 into MM2 */
  5426. add esi, eax /* move Src pointer 1 row below */
  5427. movq mm3, [edx] /* load 4 words of Kernel */
  5428. add edx, 8 /* move pointer to other 4 words */
  5429. movq mm4, [edx] /* load 4 words of Kernel */
  5430. add edx, 8 /* move pointer to other 4 words */
  5431. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5432. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5433. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5434. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5435. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5436. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5437. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5438. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5439. /* --- 4 */
  5440. movq mm1, [esi] /* load 8 bytes of the Src */
  5441. movq mm2, mm1 /* copy MM1 into MM2 */
  5442. add esi, eax /* move Src pointer 1 row below */
  5443. movq mm3, [edx] /* load 4 words of Kernel */
  5444. add edx, 8 /* move pointer to other 4 words */
  5445. movq mm4, [edx] /* load 4 words of Kernel */
  5446. add edx, 8 /* move pointer to other 4 words */
  5447. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5448. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5449. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5450. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5451. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5452. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5453. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5454. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5455. /* --- 5 */
  5456. movq mm1, [esi] /* load 8 bytes of the Src */
  5457. movq mm2, mm1 /* copy MM1 into MM2 */
  5458. movq mm3, [edx] /* load 4 words of Kernel */
  5459. add edx, 8 /* move pointer to other 4 words */
  5460. movq mm4, [edx] /* load 4 words of Kernel */
  5461. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5462. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5463. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5464. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5465. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5466. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5467. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5468. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5469. /* ---, */
  5470. movq mm3, mm7 /* copy MM7 into MM3 */
  5471. psrlq mm7, 32 /* shift 2 left words to the right */
  5472. paddsw mm7, mm3 /* add 2 left and 2 right result words */
  5473. movq mm2, mm7 /* copy MM7 into MM2 */
  5474. psrlq mm7, 16 /* shift 1 left word to the right */
  5475. paddsw mm7, mm2 /* add 1 left and 1 right result words */
  5476. movd mm1, eax /* save EAX in MM1 */
  5477. packuswb mm7, mm0 /* pack division result with saturation */
  5478. movd eax, mm7 /* copy saturated result into EAX */
  5479. mov [edi], al /* copy a byte result into Dest */
  5480. movd eax, mm1 /* restore saved EAX */
  5481. /* --, */
  5482. movd esi, mm6 /* move Src pointer to the top pixel */
  5483. sub edx, 72 /* EDX = Kernel address */
  5484. inc esi /* move Src pointer to the next pixel */
  5485. inc edi /* move Dest pointer to the next pixel */
  5486. /* ---, */
  5487. dec ecx /* decrease loop counter COLUMNS */
  5488. jnz L10372 /* check loop termination, proceed if required */
  5489. add esi, 4 /* move to the next row in Src */
  5490. add edi, 4 /* move to the next row in Dest */
  5491. dec ebx /* decrease loop counter ROWS */
  5492. jnz L10370 /* check loop termination, proceed if required */
  5493. /* ---, */
  5494. emms /* exit MMX state */
  5495. popa
  5496. }
  5497. #else
  5498. asm volatile
  5499. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  5500. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  5501. "mov %5, %%bl \n\t" /* load NRightShift into BL */
  5502. "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
  5503. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  5504. "mov %1, %%esi \n\t" /* load Src address to ESI */
  5505. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  5506. "add $2, %%edi \n\t" /* 2 column offset from the left edge */
  5507. "mov %3, %%eax \n\t" /* load columns into EAX */
  5508. "shl $1, %%eax \n\t" /* EAX = columns * 2 */
  5509. "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
  5510. "shr $1, %%eax \n\t" /* EAX = columns */
  5511. "mov %2, %%ebx \n\t" /* initialize ROWS counter */
  5512. "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
  5513. /* --- */
  5514. ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  5515. "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
  5516. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  5517. ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
  5518. "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
  5519. /* --- 1 */
  5520. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5521. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5522. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5523. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5524. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5525. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5526. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5527. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5528. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5529. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5530. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5531. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5532. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5533. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5534. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5535. /* --- 2 */
  5536. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5537. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5538. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5539. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5540. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5541. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5542. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5543. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5544. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5545. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5546. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5547. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5548. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5549. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5550. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5551. /* --- 3 */
  5552. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5553. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5554. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5555. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5556. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5557. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5558. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5559. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5560. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5561. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5562. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5563. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5564. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5565. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5566. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5567. /* --- 4 */
  5568. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5569. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5570. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5571. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5572. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5573. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5574. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5575. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5576. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5577. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5578. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5579. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5580. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5581. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5582. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5583. /* --- 5 */
  5584. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5585. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5586. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5587. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5588. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5589. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5590. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5591. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5592. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5593. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5594. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5595. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5596. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5597. /* --- */
  5598. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  5599. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  5600. "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
  5601. "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
  5602. "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
  5603. "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
  5604. "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
  5605. "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
  5606. "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
  5607. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  5608. "movd %%mm1, %%eax \n\t" /* restore saved EAX */
  5609. /* -- */
  5610. "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
  5611. "sub $72, %%edx \n\t" /* EDX = Kernel address */
  5612. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  5613. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  5614. /* --- */
  5615. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  5616. "jnz .L10372 \n\t" /* check loop termination, proceed if required */
  5617. "add $4, %%esi \n\t" /* move to the next row in Src */
  5618. "add $4, %%edi \n\t" /* move to the next row in Dest */
  5619. "dec %%ebx \n\t" /* decrease loop counter ROWS */
  5620. "jnz .L10370 \n\t" /* check loop termination, proceed if required */
  5621. /* --- */
  5622. "emms \n\t" /* exit MMX state */
  5623. "popa \n\t":"=m" (Dest) /* %0 */
  5624. :"m"(Src), /* %1 */
  5625. "m"(rows), /* %2 */
  5626. "m"(columns), /* %3 */
  5627. "m"(Kernel), /* %4 */
  5628. "m"(NRightShift) /* %5 */
  5629. );
  5630. #endif
  5631. #endif
  5632. return (0);
  5633. } else {
  5634. /* No non-MMX implementation yet */
  5635. return (-1);
  5636. }
  5637. }
  5638. /*!
  5639. \brief Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... ). Only interior pixels are written to Dest; the 3-pixel border on every side is left untouched.
  5640. \param Src The source 2D byte array to convolve. Should be different from destination.
  5641. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  5642. \param rows Number of rows in source/destination array. Must be >6.
  5643. \param columns Number of columns in source/destination array. Must be >6.
  5644. \param Kernel The 2D convolution kernel of size 7x7. NOTE(review): the MMX code loads 8 words per kernel row (pointer advances 16 bytes per row) - confirm the expected kernel layout against callers.
  5645. \param NRightShift The number of right bit shifts applied to each unpacked source pixel before it is multiplied with the kernel. Values greater than 7 are rejected.
  5646. Note: Non-MMX implementation not available for this function.
  5647. \return Returns 0 when MMX is detected (the filter itself is only compiled in with USE_MMX), -1 if a parameter is invalid or MMX is not detected.
  5648. */
  5649. int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  5650. signed short *Kernel, unsigned char NRightShift)
  5651. {
  5652. /* Validate input parameters: no NULL pointers, image at least 7x7, shift count at most 7 */
  5653. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  5654. return(-1);
  5655. if ((columns < 7) || (rows < 7) || (NRightShift > 7))
  5656. return (-1);
  5657. if ((SDL_imageFilterMMXdetect())) {
  5658. #ifdef USE_MMX
  5659. #if !defined(GCC__)
  5660. __asm
  5661. {
  5662. pusha
  5663. pxor mm0, mm0 /* zero MM0 */
  5664. xor ebx, ebx /* zero EBX */
  5665. mov bl, NRightShift /* load NRightShift into BL */
  5666. movd mm5, ebx /* copy NRightShift into MM5 */
  5667. mov edx, Kernel /* load Kernel address into EDX */
  5668. mov esi, Src /* load Src address to ESI */
  5669. mov edi, Dest /* load Dest address to EDI */
  5670. add edi, 3 /* 3 column offset from the left edge */
  5671. mov eax, columns /* load columns into EAX */
  5672. add edi, eax /* 3 row offset from the top edge */
  5673. add edi, eax /* (second of the three row steps) */
  5674. add edi, eax /* (third of the three row steps) */
  5675. mov ebx, rows /* initialize ROWS counter */
  5676. sub ebx, 6 /* do not use first 3 and last 3 rows */
  5677. /* ---, */
  5678. L10380:
  5679. mov ecx, eax /* initialize COLUMNS counter */
  5680. sub ecx, 6 /* do not use first 3 and last 3 columns */
  5681. align 16 /* 16 byte alignment of the loop entry */
  5682. L10382:
  5683. pxor mm7, mm7 /* zero MM7 (accumulator) */
  5684. movd mm6, esi /* save ESI in MM6 */
  5685. /* --- row 1 */
  5686. movq mm1, [esi] /* load 8 bytes of the Src */
  5687. movq mm2, mm1 /* copy MM1 into MM2 */
  5688. add esi, eax /* move Src pointer 1 row below */
  5689. movq mm3, [edx] /* load 4 words of Kernel */
  5690. add edx, 8 /* move pointer to other 4 words */
  5691. movq mm4, [edx] /* load 4 words of Kernel */
  5692. add edx, 8 /* move pointer to other 4 words */
  5693. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5694. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5695. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5696. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5697. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5698. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5699. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5700. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5701. /* --- row 2 */
  5702. movq mm1, [esi] /* load 8 bytes of the Src */
  5703. movq mm2, mm1 /* copy MM1 into MM2 */
  5704. add esi, eax /* move Src pointer 1 row below */
  5705. movq mm3, [edx] /* load 4 words of Kernel */
  5706. add edx, 8 /* move pointer to other 4 words */
  5707. movq mm4, [edx] /* load 4 words of Kernel */
  5708. add edx, 8 /* move pointer to other 4 words */
  5709. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5710. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5711. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5712. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5713. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5714. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5715. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5716. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5717. /* --- row 3 */
  5718. movq mm1, [esi] /* load 8 bytes of the Src */
  5719. movq mm2, mm1 /* copy MM1 into MM2 */
  5720. add esi, eax /* move Src pointer 1 row below */
  5721. movq mm3, [edx] /* load 4 words of Kernel */
  5722. add edx, 8 /* move pointer to other 4 words */
  5723. movq mm4, [edx] /* load 4 words of Kernel */
  5724. add edx, 8 /* move pointer to other 4 words */
  5725. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5726. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5727. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5728. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5729. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5730. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5731. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5732. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5733. /* --- row 4 */
  5734. movq mm1, [esi] /* load 8 bytes of the Src */
  5735. movq mm2, mm1 /* copy MM1 into MM2 */
  5736. add esi, eax /* move Src pointer 1 row below */
  5737. movq mm3, [edx] /* load 4 words of Kernel */
  5738. add edx, 8 /* move pointer to other 4 words */
  5739. movq mm4, [edx] /* load 4 words of Kernel */
  5740. add edx, 8 /* move pointer to other 4 words */
  5741. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5742. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5743. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5744. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5745. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5746. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5747. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5748. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5749. /* --- row 5 */
  5750. movq mm1, [esi] /* load 8 bytes of the Src */
  5751. movq mm2, mm1 /* copy MM1 into MM2 */
  5752. add esi, eax /* move Src pointer 1 row below */
  5753. movq mm3, [edx] /* load 4 words of Kernel */
  5754. add edx, 8 /* move pointer to other 4 words */
  5755. movq mm4, [edx] /* load 4 words of Kernel */
  5756. add edx, 8 /* move pointer to other 4 words */
  5757. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5758. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5759. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5760. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5761. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5762. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5763. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5764. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5765. /* --- row 6 */
  5766. movq mm1, [esi] /* load 8 bytes of the Src */
  5767. movq mm2, mm1 /* copy MM1 into MM2 */
  5768. add esi, eax /* move Src pointer 1 row below */
  5769. movq mm3, [edx] /* load 4 words of Kernel */
  5770. add edx, 8 /* move pointer to other 4 words */
  5771. movq mm4, [edx] /* load 4 words of Kernel */
  5772. add edx, 8 /* move pointer to other 4 words */
  5773. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5774. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5775. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5776. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5777. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5778. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5779. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5780. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5781. /* --- row 7 (last row: ESI is not advanced and EDX is advanced only once) */
  5782. movq mm1, [esi] /* load 8 bytes of the Src */
  5783. movq mm2, mm1 /* copy MM1 into MM2 */
  5784. movq mm3, [edx] /* load 4 words of Kernel */
  5785. add edx, 8 /* move pointer to other 4 words */
  5786. movq mm4, [edx] /* load 4 words of Kernel */
  5787. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  5788. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  5789. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  5790. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  5791. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  5792. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  5793. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  5794. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  5795. /* --- horizontal sum of the 4 accumulator words into the low word */
  5796. movq mm3, mm7 /* copy MM7 into MM3 */
  5797. psrlq mm7, 32 /* shift 2 left words to the right */
  5798. paddsw mm7, mm3 /* add 2 left and 2 right result words */
  5799. movq mm2, mm7 /* copy MM7 into MM2 */
  5800. psrlq mm7, 16 /* shift 1 left word to the right */
  5801. paddsw mm7, mm2 /* add 1 left and 1 right result words */
  5802. movd mm1, eax /* save EAX in MM1 */
  5803. packuswb mm7, mm0 /* pack result words into bytes with unsigned saturation */
  5804. movd eax, mm7 /* copy saturated result into EAX */
  5805. mov [edi], al /* copy a byte result into Dest */
  5806. movd eax, mm1 /* restore saved EAX */
  5807. /* --, */
  5808. movd esi, mm6 /* move Src pointer to the top pixel */
  5809. sub edx, 104 /* EDX = Kernel address (undo the 6*16 + 8 byte advance) */
  5810. inc esi /* move Src pointer to the next pixel */
  5811. inc edi /* move Dest pointer to the next pixel */
  5812. /* ---, */
  5813. dec ecx /* decrease loop counter COLUMNS */
  5814. jnz L10382 /* check loop termination, proceed if required */
  5815. add esi, 6 /* move to the next row in Src (skip the 6 border columns) */
  5816. add edi, 6 /* move to the next row in Dest (skip the 6 border columns) */
  5817. dec ebx /* decrease loop counter ROWS */
  5818. jnz L10380 /* check loop termination, proceed if required */
  5819. /* ---, */
  5820. emms /* exit MMX state */
  5821. popa
  5822. }
  5823. #else
  5824. asm volatile
  5825. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  5826. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  5827. "mov %5, %%bl \n\t" /* load NRightShift into BL */
  5828. "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
  5829. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  5830. "mov %1, %%esi \n\t" /* load Src address to ESI */
  5831. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  5832. "add $3, %%edi \n\t" /* 3 column offset from the left edge */
  5833. "mov %3, %%eax \n\t" /* load columns into EAX */
  5834. "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
  5835. "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* two more row steps, then initialize ROWS counter */
  5836. "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
  5837. /* --- */
  5838. ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  5839. "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
  5840. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  5841. ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
  5842. "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
  5843. /* --- row 1 */
  5844. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5845. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5846. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5847. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5848. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5849. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5850. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5851. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5852. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5853. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5854. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5855. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5856. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5857. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5858. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5859. /* --- row 2 */
  5860. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5861. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5862. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5863. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5864. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5865. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5866. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5867. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5868. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5869. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5870. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5871. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5872. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5873. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5874. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5875. /* --- row 3 */
  5876. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5877. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5878. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5879. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5880. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5881. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5882. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5883. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5884. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5885. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5886. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5887. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5888. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5889. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5890. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5891. /* --- row 4 */
  5892. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5893. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5894. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5895. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5896. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5897. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5898. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5899. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5900. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5901. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5902. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5903. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5904. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5905. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5906. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5907. /* --- row 5 */
  5908. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5909. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5910. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5911. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5912. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5913. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5914. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5915. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5916. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5917. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5918. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5919. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5920. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5921. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5922. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5923. /* --- row 6 */
  5924. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5925. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5926. "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  5927. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5928. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5929. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5930. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5931. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5932. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5933. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5934. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5935. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5936. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5937. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5938. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5939. /* --- row 7 (last row: ESI is not advanced and EDX is advanced only once) */
  5940. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  5941. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  5942. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  5943. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  5944. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  5945. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  5946. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  5947. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  5948. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  5949. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  5950. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  5951. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  5952. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  5953. /* --- horizontal sum of the 4 accumulator words into the low word */
  5954. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  5955. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  5956. "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
  5957. "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
  5958. "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
  5959. "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
  5960. "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
  5961. "packuswb %%mm0, %%mm7 \n\t" /* pack result words into bytes with unsigned saturation */
  5962. "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
  5963. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  5964. "movd %%mm1, %%eax \n\t" /* restore saved EAX */
  5965. /* -- */
  5966. "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
  5967. "sub $104, %%edx \n\t" /* EDX = Kernel address (undo the 6*16 + 8 byte advance) */
  5968. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  5969. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  5970. /* --- */
  5971. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  5972. "jnz .L10382 \n\t" /* check loop termination, proceed if required */
  5973. "add $6, %%esi \n\t" /* move to the next row in Src (skip the 6 border columns) */
  5974. "add $6, %%edi \n\t" /* move to the next row in Dest (skip the 6 border columns) */
  5975. "dec %%ebx \n\t" /* decrease loop counter ROWS */
  5976. "jnz .L10380 \n\t" /* check loop termination, proceed if required */
  5977. /* --- */
  5978. "emms \n\t" /* exit MMX state */
  5979. "popa \n\t":"=m" (Dest) /* %0 */
  5980. :"m"(Src), /* %1 */
  5981. "m"(rows), /* %2 */
  5982. "m"(columns), /* %3 */
  5983. "m"(Kernel), /* %4 */
  5984. "m"(NRightShift) /* %5 */
  5985. );
  5986. #endif
  5987. #endif
  5988. return (0); /* reached whenever MMX was detected, even if USE_MMX compiled the asm out */
  5989. } else {
  5990. /* No non-MMX implementation yet */
  5991. return (-1);
  5992. }
  5993. }
  5994. /*!
  5995. \brief Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... )
  5996. \param Src The source 2D byte array to convolve. Should be different from destination.
  5997. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  5998. \param rows Number of rows in source/destination array. Must be >8.
  5999. \param columns Number of columns in source/destination array. Must be >8.
  6000. \param Kernel The 2D convolution kernel of size 9x9.
  6001. \param NRightShift The number of right bit shifts applied to each unpacked source pixel before it is multiplied with the kernel. Must be <=7 (larger values are rejected).
  6002. Note: Non-MMX implementation not available for this function.
  6003. \return Returns 1 if filter was applied, 0 otherwise.
  6004. */
  6005. int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  6006. signed short *Kernel, unsigned char NRightShift)
  6007. {
  6008. /* Validate input parameters */
  6009. if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
  6010. return(-1);
  6011. if ((columns < 9) || (rows < 9) || (NRightShift > 7))
  6012. return (-1);
  6013. if ((SDL_imageFilterMMXdetect())) {
  6014. #ifdef USE_MMX
  6015. #if !defined(GCC__)
  6016. __asm
  6017. {
  6018. pusha
  6019. pxor mm0, mm0 /* zero MM0 */
  6020. xor ebx, ebx /* zero EBX */
  6021. mov bl, NRightShift /* load NRightShift into BL */
  6022. movd mm5, ebx /* copy NRightShift into MM5 */
  6023. mov edx, Kernel /* load Kernel address into EDX */
  6024. mov esi, Src /* load Src address to ESI */
  6025. mov edi, Dest /* load Dest address to EDI */
  6026. add edi, 4 /* 4 column offset from the left edge */
  6027. mov eax, columns /* load columns into EAX */
  6028. add edi, eax /* 4 row offset from the top edge */
  6029. add edi, eax
  6030. add edi, eax
  6031. add edi, eax
  6032. mov ebx, rows /* initialize ROWS counter */
  6033. sub ebx, 8 /* do not use first 4 and last 4 rows */
  6034. /* ---, */
  6035. L10390:
  6036. mov ecx, eax /* initialize COLUMNS counter */
  6037. sub ecx, 8 /* do not use first 4 and last 4 columns */
  6038. align 16 /* 16 byte alignment of the loop entry */
  6039. L10392:
  6040. pxor mm7, mm7 /* zero MM7 (accumulator) */
  6041. movd mm6, esi /* save ESI in MM6 */
  6042. /* --- 1 */
  6043. movq mm1, [esi] /* load 8 bytes of the Src */
  6044. movq mm2, mm1 /* copy MM1 into MM2 */
  6045. inc esi /* move pointer to the next 8 bytes of Src */
  6046. movq mm3, [edx] /* load 4 words of Kernel */
  6047. add edx, 8 /* move pointer to other 4 words */
  6048. movq mm4, [edx] /* load 4 words of Kernel */
  6049. add edx, 8 /* move pointer to other 4 words */
  6050. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6051. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6052. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6053. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6054. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6055. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6056. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6057. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6058. movq mm1, [esi] /* load 8 bytes of the Src */
  6059. dec esi
  6060. add esi, eax /* move Src pointer 1 row below */
  6061. movq mm3, [edx] /* load 4 words of Kernel */
  6062. add edx, 8 /* move pointer to other 4 words */
  6063. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6064. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6065. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6066. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6067. /* --- 2 */
  6068. movq mm1, [esi] /* load 8 bytes of the Src */
  6069. movq mm2, mm1 /* copy MM1 into MM2 */
  6070. inc esi /* move pointer to the next 8 bytes of Src */
  6071. movq mm3, [edx] /* load 4 words of Kernel */
  6072. add edx, 8 /* move pointer to other 4 words */
  6073. movq mm4, [edx] /* load 4 words of Kernel */
  6074. add edx, 8 /* move pointer to other 4 words */
  6075. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6076. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6077. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6078. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6079. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6080. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6081. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6082. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6083. movq mm1, [esi] /* load 8 bytes of the Src */
  6084. dec esi
  6085. add esi, eax /* move Src pointer 1 row below */
  6086. movq mm3, [edx] /* load 4 words of Kernel */
  6087. add edx, 8 /* move pointer to other 4 words */
  6088. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6089. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6090. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6091. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6092. /* --- 3 */
  6093. movq mm1, [esi] /* load 8 bytes of the Src */
  6094. movq mm2, mm1 /* copy MM1 into MM2 */
  6095. inc esi /* move pointer to the next 8 bytes of Src */
  6096. movq mm3, [edx] /* load 4 words of Kernel */
  6097. add edx, 8 /* move pointer to other 4 words */
  6098. movq mm4, [edx] /* load 4 words of Kernel */
  6099. add edx, 8 /* move pointer to other 4 words */
  6100. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6101. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6102. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6103. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6104. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6105. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6106. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6107. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6108. movq mm1, [esi] /* load 8 bytes of the Src */
  6109. dec esi
  6110. add esi, eax /* move Src pointer 1 row below */
  6111. movq mm3, [edx] /* load 4 words of Kernel */
  6112. add edx, 8 /* move pointer to other 4 words */
  6113. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6114. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6115. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6116. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6117. /* --- 4 */
  6118. movq mm1, [esi] /* load 8 bytes of the Src */
  6119. movq mm2, mm1 /* copy MM1 into MM2 */
  6120. inc esi /* move pointer to the next 8 bytes of Src */
  6121. movq mm3, [edx] /* load 4 words of Kernel */
  6122. add edx, 8 /* move pointer to other 4 words */
  6123. movq mm4, [edx] /* load 4 words of Kernel */
  6124. add edx, 8 /* move pointer to other 4 words */
  6125. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6126. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6127. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6128. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6129. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6130. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6131. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6132. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6133. movq mm1, [esi] /* load 8 bytes of the Src */
  6134. dec esi
  6135. add esi, eax /* move Src pointer 1 row below */
  6136. movq mm3, [edx] /* load 4 words of Kernel */
  6137. add edx, 8 /* move pointer to other 4 words */
  6138. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6139. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6140. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6141. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6142. /* --- 5 */
  6143. movq mm1, [esi] /* load 8 bytes of the Src */
  6144. movq mm2, mm1 /* copy MM1 into MM2 */
  6145. inc esi /* move pointer to the next 8 bytes of Src */
  6146. movq mm3, [edx] /* load 4 words of Kernel */
  6147. add edx, 8 /* move pointer to other 4 words */
  6148. movq mm4, [edx] /* load 4 words of Kernel */
  6149. add edx, 8 /* move pointer to other 4 words */
  6150. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6151. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6152. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6153. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6154. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6155. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6156. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6157. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6158. movq mm1, [esi] /* load 8 bytes of the Src */
  6159. dec esi
  6160. add esi, eax /* move Src pointer 1 row below */
  6161. movq mm3, [edx] /* load 4 words of Kernel */
  6162. add edx, 8 /* move pointer to other 4 words */
  6163. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6164. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6165. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6166. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6167. /* --- 6 */
  6168. movq mm1, [esi] /* load 8 bytes of the Src */
  6169. movq mm2, mm1 /* copy MM1 into MM2 */
  6170. inc esi /* move pointer to the next 8 bytes of Src */
  6171. movq mm3, [edx] /* load 4 words of Kernel */
  6172. add edx, 8 /* move pointer to other 4 words */
  6173. movq mm4, [edx] /* load 4 words of Kernel */
  6174. add edx, 8 /* move pointer to other 4 words */
  6175. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6176. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6177. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6178. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6179. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6180. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6181. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6182. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6183. movq mm1, [esi] /* load 8 bytes of the Src */
  6184. dec esi
  6185. add esi, eax /* move Src pointer 1 row below */
  6186. movq mm3, [edx] /* load 4 words of Kernel */
  6187. add edx, 8 /* move pointer to other 4 words */
  6188. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6189. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6190. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6191. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6192. /* --- 7 */
  6193. movq mm1, [esi] /* load 8 bytes of the Src */
  6194. movq mm2, mm1 /* copy MM1 into MM2 */
  6195. inc esi /* move pointer to the next 8 bytes of Src */
  6196. movq mm3, [edx] /* load 4 words of Kernel */
  6197. add edx, 8 /* move pointer to other 4 words */
  6198. movq mm4, [edx] /* load 4 words of Kernel */
  6199. add edx, 8 /* move pointer to other 4 words */
  6200. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6201. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6202. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6203. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6204. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6205. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6206. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6207. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6208. movq mm1, [esi] /* load 8 bytes of the Src */
  6209. dec esi
  6210. add esi, eax /* move Src pointer 1 row below */
  6211. movq mm3, [edx] /* load 4 words of Kernel */
  6212. add edx, 8 /* move pointer to other 4 words */
  6213. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6214. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6215. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6216. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6217. /* --- 8 */
  6218. movq mm1, [esi] /* load 8 bytes of the Src */
  6219. movq mm2, mm1 /* copy MM1 into MM2 */
  6220. inc esi /* move pointer to the next 8 bytes of Src */
  6221. movq mm3, [edx] /* load 4 words of Kernel */
  6222. add edx, 8 /* move pointer to other 4 words */
  6223. movq mm4, [edx] /* load 4 words of Kernel */
  6224. add edx, 8 /* move pointer to other 4 words */
  6225. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6226. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6227. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6228. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6229. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6230. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6231. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6232. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6233. movq mm1, [esi] /* load 8 bytes of the Src */
  6234. dec esi
  6235. add esi, eax /* move Src pointer 1 row below */
  6236. movq mm3, [edx] /* load 4 words of Kernel */
  6237. add edx, 8 /* move pointer to other 4 words */
  6238. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6239. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6240. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6241. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6242. /* --- 9 */
  6243. movq mm1, [esi] /* load 8 bytes of the Src */
  6244. movq mm2, mm1 /* copy MM1 into MM2 */
  6245. inc esi /* move pointer to the next 8 bytes of Src */
  6246. movq mm3, [edx] /* load 4 words of Kernel */
  6247. add edx, 8 /* move pointer to other 4 words */
  6248. movq mm4, [edx] /* load 4 words of Kernel */
  6249. add edx, 8 /* move pointer to other 4 words */
  6250. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6251. punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
  6252. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6253. psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
  6254. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6255. pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
  6256. paddsw mm1, mm2 /* add 4 words of the high and low bytes */
  6257. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6258. movq mm1, [esi] /* load 8 bytes of the Src */
  6259. movq mm3, [edx] /* load 4 words of Kernel */
  6260. punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
  6261. psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
  6262. pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
  6263. paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
  6264. /* ---, */
  6265. movq mm3, mm7 /* copy MM7 into MM3 */
  6266. psrlq mm7, 32 /* shift 2 left words to the right */
  6267. paddsw mm7, mm3 /* add 2 left and 2 right result words */
  6268. movq mm2, mm7 /* copy MM7 into MM2 */
  6269. psrlq mm7, 16 /* shift 1 left word to the right */
  6270. paddsw mm7, mm2 /* add 1 left and 1 right result words */
  6271. movd mm1, eax /* save EAX in MM1 */
  6272. packuswb mm7, mm0 /* pack division result with saturation */
  6273. movd eax, mm7 /* copy saturated result into EAX */
  6274. mov [edi], al /* copy a byte result into Dest */
  6275. movd eax, mm1 /* restore saved EAX */
  6276. /* --, */
  6277. movd esi, mm6 /* move Src pointer to the top pixel */
  6278. sub edx, 208 /* EDX = Kernel address */
  6279. inc esi /* move Src pointer to the next pixel */
  6280. inc edi /* move Dest pointer to the next pixel */
  6281. /* ---, */
  6282. dec ecx /* decrease loop counter COLUMNS */
  6283. jnz L10392 /* check loop termination, proceed if required */
  6284. add esi, 8 /* move to the next row in Src */
  6285. add edi, 8 /* move to the next row in Dest */
  6286. dec ebx /* decrease loop counter ROWS */
  6287. jnz L10390 /* check loop termination, proceed if required */
  6288. /* ---, */
  6289. emms /* exit MMX state */
  6290. popa
  6291. }
  6292. #else
  6293. asm volatile
  6294. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  6295. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  6296. "mov %5, %%bl \n\t" /* load NRightShift into BL */
  6297. "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
  6298. "mov %4, %%edx \n\t" /* load Kernel address into EDX */
  6299. "mov %1, %%esi \n\t" /* load Src address to ESI */
  6300. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  6301. "add $4, %%edi \n\t" /* 4 column offset from the left edge */
  6302. "mov %3, %%eax \n\t" /* load columns into EAX */
  6303. "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
  6304. "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
  6305. "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
  6306. /* --- */
  6307. ".L10390: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  6308. "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
  6309. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  6310. ".L10392: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
  6311. "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
  6312. /* --- 1 */
  6313. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6314. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6315. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6316. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6317. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6318. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6319. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6320. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6321. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6322. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6323. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6324. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6325. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6326. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6327. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6328. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6329. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6330. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6331. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6332. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6333. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6334. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6335. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6336. /* --- 2 */
  6337. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6338. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6339. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6340. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6341. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6342. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6343. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6344. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6345. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6346. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6347. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6348. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6349. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6350. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6351. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6352. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6353. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6354. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6355. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6356. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6357. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6358. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6359. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6360. /* --- 3 */
  6361. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6362. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6363. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6364. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6365. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6366. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6367. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6368. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6369. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6370. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6371. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6372. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6373. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6374. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6375. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6376. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6377. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6378. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6379. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6380. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6381. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6382. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6383. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6384. /* --- 4 */
  6385. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6386. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6387. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6388. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6389. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6390. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6391. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6392. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6393. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6394. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6395. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6396. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6397. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6398. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6399. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6400. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6401. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6402. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6403. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6404. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6405. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6406. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6407. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6408. /* --- 5 */
  6409. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6410. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6411. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6412. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6413. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6414. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6415. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6416. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6417. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6418. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6419. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6420. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6421. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6422. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6423. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6424. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6425. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6426. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6427. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6428. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6429. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6430. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6431. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6432. /* --- 6 */
  6433. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6434. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6435. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6436. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6437. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6438. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6439. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6440. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6441. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6442. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6443. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6444. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6445. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6446. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6447. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6448. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6449. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6450. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6451. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6452. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6453. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6454. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6455. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6456. /* --- 7 */
  6457. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6458. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6459. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6460. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6461. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6462. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6463. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6464. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6465. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6466. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6467. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6468. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6469. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6470. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6471. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6472. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6473. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6474. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6475. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6476. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6477. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6478. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6479. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6480. /* --- 8 */
  6481. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6482. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6483. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6484. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6485. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6486. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6487. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6488. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6489. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6490. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6491. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6492. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6493. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6494. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6495. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6496. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6497. "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
  6498. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6499. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6500. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6501. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6502. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6503. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6504. /* --- 9 */
  6505. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6506. "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
  6507. "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
  6508. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6509. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6510. "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
  6511. "add $8, %%edx \n\t" /* move pointer to other 4 words */
  6512. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6513. "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
  6514. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6515. "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  6516. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6517. "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
  6518. "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
  6519. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6520. "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
  6521. "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
  6522. "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
  6523. "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
  6524. "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
  6525. "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
  6526. /* --- */
  6527. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  6528. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  6529. "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
  6530. "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
  6531. "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
  6532. "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
  6533. "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
  6534. "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
  6535. "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
  6536. "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
  6537. "movd %%mm1, %%eax \n\t" /* restore saved EAX */
  6538. /* -- */
  6539. "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
  6540. "sub $208, %%edx \n\t" /* EDX = Kernel address */
  6541. "inc %%esi \n\t" /* move Src pointer to the next pixel */
  6542. "inc %%edi \n\t" /* move Dest pointer to the next pixel */
  6543. /* --- */
  6544. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  6545. "jnz .L10392 \n\t" /* check loop termination, proceed if required */
  6546. "add $8, %%esi \n\t" /* move to the next row in Src */
  6547. "add $8, %%edi \n\t" /* move to the next row in Dest */
  6548. "dec %%ebx \n\t" /* decrease loop counter ROWS */
  6549. "jnz .L10390 \n\t" /* check loop termination, proceed if required */
  6550. /* --- */
  6551. "emms \n\t" /* exit MMX state */
  6552. "popa \n\t":"=m" (Dest) /* %0 */
  6553. :"m"(Src), /* %1 */
  6554. "m"(rows), /* %2 */
  6555. "m"(columns), /* %3 */
  6556. "m"(Kernel), /* %4 */
  6557. "m"(NRightShift) /* %5 */
  6558. );
  6559. #endif
  6560. #endif
  6561. return (0);
  6562. } else {
  6563. /* No non-MMX implementation yet */
  6564. return (-1);
  6565. }
  6566. }
  6567. /* ------------------------------------------------------------------------------------ */
  6568. /*!
  6569. \brief Filter using SobelX: Dij = saturation255( ... )
  6570. \param Src The source 2D byte array to sobel-filter. Should be different from destination.
  6571. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  6572. \param rows Number of rows in source/destination array. Must be >2.
  6573. \param columns Number of columns in source/destination array. Must be >7.
  6574. Note: Non-MMX implementation not available for this function.
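  6575. \return Returns -1 if Src or Dest is NULL, or if rows < 3 or columns < 8; otherwise presumably 0 on success, as for the other filters in this file (TODO confirm against the end of the function).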
  6576. */
  6577. int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
  6578. {
  6579. /* Validate input parameters */
  6580. if ((Src == NULL) || (Dest == NULL))
  6581. return(-1);
  6582. if ((columns < 8) || (rows < 3))
  6583. return (-1);
  6584. if ((SDL_imageFilterMMXdetect())) {
  6585. #ifdef USE_MMX
  6586. #if !defined(GCC__)
  6587. __asm
  6588. {
  6589. pusha
  6590. pxor mm0, mm0 /* zero MM0 */
  6591. mov eax, columns /* load columns into EAX */
  6592. /* ---, */
  6593. mov esi, Src /* ESI = Src row 0 address */
  6594. mov edi, Dest /* load Dest address to EDI */
  6595. add edi, eax /* EDI = EDI + columns */
  6596. inc edi /* 1 byte offset from the left edge */
  6597. mov edx, rows /* initialize ROWS counter */
  6598. sub edx, 2 /* do not use first and last rows */
  6599. /* ---, */
  6600. L10400:
  6601. mov ecx, eax /* initialize COLUMS counter */
  6602. shr ecx, 3 /* EBX/8 (MMX loads 8 bytes at a time) */
  6603. mov ebx, esi /* save ESI in EBX */
  6604. movd mm1, edi /* save EDI in MM1 */
  6605. align 16 /* 16 byte alignment of the loop entry */
  6606. L10402:
  6607. /* ---, */
  6608. movq mm4, [esi] /* load 8 bytes from Src */
  6609. movq mm5, mm4 /* save MM4 in MM5 */
  6610. add esi, 2 /* move ESI pointer 2 bytes right */
  6611. punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
  6612. punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
  6613. movq mm6, [esi] /* load 8 bytes from Src */
  6614. movq mm7, mm6 /* save MM6 in MM7 */
  6615. sub esi, 2 /* move ESI pointer back 2 bytes left */
  6616. punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
  6617. punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
  6618. add esi, eax /* move to the next row of Src */
  6619. movq mm2, [esi] /* load 8 bytes from Src */
  6620. movq mm3, mm2 /* save MM2 in MM3 */
  6621. add esi, 2 /* move ESI pointer 2 bytes right */
  6622. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6623. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6624. paddw mm4, mm2 /* add 4 low bytes to accumolator MM4 */
  6625. paddw mm5, mm3 /* add 4 high bytes to accumolator MM5 */
  6626. paddw mm4, mm2 /* add 4 low bytes to accumolator MM4 */
  6627. paddw mm5, mm3 /* add 4 high bytes to accumolator MM5 */
  6628. movq mm2, [esi] /* load 8 bytes from Src */
  6629. movq mm3, mm2 /* save MM2 in MM3 */
  6630. sub esi, 2 /* move ESI pointer back 2 bytes left */
  6631. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6632. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6633. paddw mm6, mm2 /* add 4 low bytes to accumolator MM6 */
  6634. paddw mm7, mm3 /* add 4 high bytes to accumolator MM7 */
  6635. paddw mm6, mm2 /* add 4 low bytes to accumolator MM6 */
  6636. paddw mm7, mm3 /* add 4 high bytes to accumolator MM7 */
  6637. add esi, eax /* move to the next row of Src */
  6638. movq mm2, [esi] /* load 8 bytes from Src */
  6639. movq mm3, mm2 /* save MM2 in MM3 */
  6640. add esi, 2 /* move ESI pointer 2 bytes right */
  6641. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6642. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6643. paddw mm4, mm2 /* add 4 low bytes to accumolator MM4 */
  6644. paddw mm5, mm3 /* add 4 high bytes to accumolator MM5 */
  6645. movq mm2, [esi] /* load 8 bytes from Src */
  6646. movq mm3, mm2 /* save MM2 in MM3 */
  6647. sub esi, 2 /* move ESI pointer back 2 bytes left */
  6648. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6649. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6650. paddw mm6, mm2 /* add 4 low bytes to accumolator MM6 */
  6651. paddw mm7, mm3 /* add 4 high bytes to accumolator MM7 */
  6652. /* ---, */
  6653. movq mm2, mm4 /* copy MM4 into MM2 */
  6654. psrlq mm4, 32 /* shift 2 left words to the right */
  6655. psubw mm4, mm2 /* MM4 = MM4 - MM2 */
  6656. movq mm3, mm6 /* copy MM6 into MM3 */
  6657. psrlq mm6, 32 /* shift 2 left words to the right */
  6658. psubw mm6, mm3 /* MM6 = MM6 - MM3 */
  6659. punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
  6660. movq mm2, mm5 /* copy MM6 into MM2 */
  6661. psrlq mm5, 32 /* shift 2 left words to the right */
  6662. psubw mm5, mm2 /* MM5 = MM5 - MM2 */
  6663. movq mm3, mm7 /* copy MM7 into MM3 */
  6664. psrlq mm7, 32 /* shift 2 left words to the right */
  6665. psubw mm7, mm3 /* MM7 = MM7 - MM3 */
  6666. punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
  6667. /* Take abs values of MM4 and MM5 */
  6668. movq mm6, mm4 /* copy MM4 into MM6 */
  6669. movq mm7, mm5 /* copy MM5 into MM7 */
  6670. psraw mm6, 15 /* fill MM6 words with word sign bit */
  6671. psraw mm7, 15 /* fill MM7 words with word sign bit */
  6672. pxor mm4, mm6 /* take 1's compliment of only neg words */
  6673. pxor mm5, mm7 /* take 1's compliment of only neg words */
  6674. psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
  6675. psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
  6676. packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
  6677. movq [edi], mm4 /* store result in Dest */
  6678. /* ---, */
  6679. sub esi, eax /* move to the current top row in Src */
  6680. sub esi, eax
  6681. add esi, 8 /* move Src pointer to the next 8 pixels */
  6682. add edi, 8 /* move Dest pointer to the next 8 pixels */
  6683. /* ---, */
  6684. dec ecx /* decrease loop counter COLUMNS */
  6685. jnz L10402 /* check loop termination, proceed if required */
  6686. mov esi, ebx /* restore most left current row Src address */
  6687. movd edi, mm1 /* restore most left current row Dest address */
  6688. add esi, eax /* move to the next row in Src */
  6689. add edi, eax /* move to the next row in Dest */
  6690. dec edx /* decrease loop counter ROWS */
  6691. jnz L10400 /* check loop termination, proceed if required */
  6692. /* ---, */
  6693. emms /* exit MMX state */
  6694. popa
  6695. }
  6696. #else
  6697. asm volatile
  6698. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  6699. "mov %3, %%eax \n\t" /* load columns into EAX */
  6700. /* --- */
  6701. "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
  6702. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  6703. "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
  6704. "inc %%edi \n\t" /* 1 byte offset from the left edge */
  6705. "mov %2, %%edx \n\t" /* initialize ROWS counter */
  6706. "sub $2, %%edx \n\t" /* do not use first and last rows */
  6707. /* --- */
  6708. ".L10400: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
  6709. "shr $3, %%ecx \n\t" /* EBX/8 (MMX loads 8 bytes at a time) */
  6710. "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
  6711. "movd %%edi, %%mm1 \n\t" /* save EDI in MM1 */
  6712. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  6713. ".L10402: \n\t"
  6714. /* --- */
  6715. "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
  6716. "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
  6717. "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
  6718. "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
  6719. "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
  6720. "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
  6721. "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
  6722. "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
  6723. "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
  6724. "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
  6725. "add %%eax, %%esi \n\t" /* move to the next row of Src */
  6726. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  6727. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  6728. "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
  6729. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  6730. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  6731. "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumolator MM4 */
  6732. "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumolator MM5 */
  6733. "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumolator MM4 */
  6734. "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumolator MM5 */
  6735. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  6736. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  6737. "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
  6738. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  6739. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  6740. "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumolator MM6 */
  6741. "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumolator MM7 */
  6742. "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumolator MM6 */
  6743. "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumolator MM7 */
  6744. "add %%eax, %%esi \n\t" /* move to the next row of Src */
  6745. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  6746. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  6747. "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
  6748. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  6749. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  6750. "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumolator MM4 */
  6751. "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumolator MM5 */
  6752. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  6753. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  6754. "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
  6755. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  6756. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  6757. "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumolator MM6 */
  6758. "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumolator MM7 */
  6759. /* --- */
  6760. "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
  6761. "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
  6762. "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
  6763. "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
  6764. "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
  6765. "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
  6766. "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
  6767. "movq %%mm5, %%mm2 \n\t" /* copy MM6 into MM2 */
  6768. "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
  6769. "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
  6770. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  6771. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  6772. "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
  6773. "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
  6774. /* Take abs values of MM4 and MM5 */
  6775. "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
  6776. "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
  6777. "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
  6778. "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
  6779. "pxor %%mm6, %%mm4 \n\t" /* take 1's compliment of only neg. words */
  6780. "pxor %%mm7, %%mm5 \n\t" /* take 1's compliment of only neg. words */
  6781. "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  6782. "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  6783. "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
  6784. "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
  6785. /* --- */
  6786. "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
  6787. "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
  6788. "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
  6789. /* --- */
  6790. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  6791. "jnz .L10402 \n\t" /* check loop termination, proceed if required */
  6792. "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */
  6793. "movd %%mm1, %%edi \n\t" /* restore most left current row Dest address */
  6794. "add %%eax, %%esi \n\t" /* move to the next row in Src */
  6795. "add %%eax, %%edi \n\t" /* move to the next row in Dest */
  6796. "dec %%edx \n\t" /* decrease loop counter ROWS */
  6797. "jnz .L10400 \n\t" /* check loop termination, proceed if required */
  6798. /* --- */
  6799. "emms \n\t" /* exit MMX state */
  6800. "popa \n\t":"=m" (Dest) /* %0 */
  6801. :"m"(Src), /* %1 */
  6802. "m"(rows), /* %2 */
  6803. "m"(columns) /* %3 */
  6804. );
  6805. #endif
  6806. #endif
  6807. return (0);
  6808. } else {
  6809. /* No non-MMX implementation yet */
  6810. return (-1);
  6811. }
  6812. }
  6813. /*!
  6814. \brief Filter using SobelXShiftRight: Dij = saturation255( ... )
        Same column-difference scheme as SDL_imageFilterSobelX, except that every source
        byte is shifted right by NRightShift bits (after widening to 16 bit) before it is
        added to the column sums.
        Rows are addressed with a stride of exactly `columns` bytes. Output starts at
        row 1, column 1 of Dest; rows-2 rows are processed and 8*(columns/8) bytes are
        stored per row.
  6815. \param Src The source 2D byte array to sobel-filter. Should be different from destination.
  6816. \param Dest The destination 2D byte array to store the result in. Should be different from source.
  6817. \param rows Number of rows in source/destination array. Must be >2.
  6818. \param columns Number of columns in source/destination array. Must be >7.
  6819. \param NRightShift The number of right bit shifts to apply to each source pixel before summing. Must be <8.
  6820. Note: Non-MMX implementation not available for this function.
        Note: The assembly is only compiled when USE_MMX is defined; without it the MMX
        branch is empty and still returns 0 without writing to Dest.
        Note: The assembly uses the local copy of the `rows` parameter as its row loop
        counter and modifies it in place.
        NOTE(review): each inner iteration loads 8 bytes at offsets 0 and +2 of three rows,
        so when columns is a multiple of 8 the last iteration on the last row appears to
        read up to 2 bytes past a rows*columns buffer - confirm callers over-allocate Src.
  6821. \return Returns 0 if the MMX branch was taken, -1 if a parameter is invalid or SDL_imageFilterMMXdetect() returns 0.
  6822. */
  6823. int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
  6824. unsigned char NRightShift)
  6825. {
  6826. /* Validate input parameters */
  6827. if ((Src == NULL) || (Dest == NULL))
  6828. return(-1);
  6829. if ((columns < 8) || (rows < 3) || (NRightShift > 7))
  6830. return (-1);
  6831. if ((SDL_imageFilterMMXdetect())) {
  6832. #ifdef USE_MMX
  6833. #if !defined(GCC__)
  6834. __asm
  6835. {
  6836. pusha
  6837. pxor mm0, mm0 /* zero MM0 */
  6838. mov eax, columns /* load columns into EAX */
  6839. xor ebx, ebx /* zero EBX */
  6840. mov bl, NRightShift /* load NRightShift into BL */
  6841. movd mm1, ebx /* copy NRightShift into MM1 */
  6842. /* ---, */
  6843. mov esi, Src /* ESI = Src row 0 address */
  6844. mov edi, Dest /* load Dest address to EDI */
  6845. add edi, eax /* EDI = EDI + columns */
  6846. inc edi /* 1 byte offset from the left edge */
  6847. /* initialize ROWS counter (kept in the rows variable itself, EDX is used to save EDI) */
  6848. sub rows, 2 /* do not use first and last rows */
  6849. /* ---, */
  6850. L10410:
  6851. mov ecx, eax /* initialize COLUMNS counter */
  6852. shr ecx, 3 /* ECX = columns/8 (MMX loads 8 bytes at a time) */
  6853. mov ebx, esi /* save ESI in EBX */
  6854. mov edx, edi /* save EDI in EDX */
  6855. align 16 /* 16 byte alignment of the loop entry */
  6856. L10412:
  6857. /* ---, */
  6858. movq mm4, [esi] /* load 8 bytes from Src */
  6859. movq mm5, mm4 /* save MM4 in MM5 */
  6860. add esi, 2 /* move ESI pointer 2 bytes right */
  6861. punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
  6862. punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
  6863. psrlw mm4, mm1 /* shift right each pixel NshiftRight times */
  6864. psrlw mm5, mm1 /* shift right each pixel NshiftRight times */
  6865. movq mm6, [esi] /* load 8 bytes from Src */
  6866. movq mm7, mm6 /* save MM6 in MM7 */
  6867. sub esi, 2 /* move ESI pointer back 2 bytes left */
  6868. punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
  6869. punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
  6870. psrlw mm6, mm1 /* shift right each pixel NshiftRight times */
  6871. psrlw mm7, mm1 /* shift right each pixel NshiftRight times */
  6872. add esi, eax /* move to the next row of Src */
  6873. movq mm2, [esi] /* load 8 bytes from Src */
  6874. movq mm3, mm2 /* save MM2 in MM3 */
  6875. add esi, 2 /* move ESI pointer 2 bytes right */
  6876. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6877. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6878. psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
  6879. psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
  6880. paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
  6881. paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
  6882. paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 (middle row is weighted twice) */
  6883. paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 (middle row is weighted twice) */
  6884. movq mm2, [esi] /* load 8 bytes from Src */
  6885. movq mm3, mm2 /* save MM2 in MM3 */
  6886. sub esi, 2 /* move ESI pointer back 2 bytes left */
  6887. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6888. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6889. psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
  6890. psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
  6891. paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
  6892. paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
  6893. paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 (middle row is weighted twice) */
  6894. paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 (middle row is weighted twice) */
  6895. add esi, eax /* move to the next row of Src */
  6896. movq mm2, [esi] /* load 8 bytes from Src */
  6897. movq mm3, mm2 /* save MM2 in MM3 */
  6898. add esi, 2 /* move ESI pointer 2 bytes right */
  6899. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6900. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6901. psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
  6902. psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
  6903. paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
  6904. paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
  6905. movq mm2, [esi] /* load 8 bytes from Src */
  6906. movq mm3, mm2 /* save MM2 in MM3 */
  6907. sub esi, 2 /* move ESI pointer back 2 bytes left */
  6908. punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
  6909. punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
  6910. psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
  6911. psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
  6912. paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
  6913. paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
  6914. /* ---, */
  6915. movq mm2, mm4 /* copy MM4 into MM2 */
  6916. psrlq mm4, 32 /* shift 2 left words to the right */
  6917. psubw mm4, mm2 /* MM4 = MM4 - MM2 */
  6918. movq mm3, mm6 /* copy MM6 into MM3 */
  6919. psrlq mm6, 32 /* shift 2 left words to the right */
  6920. psubw mm6, mm3 /* MM6 = MM6 - MM3 */
  6921. punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
  6922. movq mm2, mm5 /* copy MM5 into MM2 */
  6923. psrlq mm5, 32 /* shift 2 left words to the right */
  6924. psubw mm5, mm2 /* MM5 = MM5 - MM2 */
  6925. movq mm3, mm7 /* copy MM7 into MM3 */
  6926. psrlq mm7, 32 /* shift 2 left words to the right */
  6927. psubw mm7, mm3 /* MM7 = MM7 - MM3 */
  6928. punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
  6929. /* Take abs values of MM4 and MM5 */
  6930. movq mm6, mm4 /* copy MM4 into MM6 */
  6931. movq mm7, mm5 /* copy MM5 into MM7 */
  6932. psraw mm6, 15 /* fill MM6 words with word sign bit */
  6933. psraw mm7, 15 /* fill MM7 words with word sign bit */
  6934. pxor mm4, mm6 /* take 1's complement of only neg words */
  6935. pxor mm5, mm7 /* take 1's complement of only neg words */
  6936. psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
  6937. psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
  6938. packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
  6939. movq [edi], mm4 /* store result in Dest */
  6940. /* ---, */
  6941. sub esi, eax /* move to the current top row in Src */
  6942. sub esi, eax
  6943. add esi, 8 /* move Src pointer to the next 8 pixels */
  6944. add edi, 8 /* move Dest pointer to the next 8 pixels */
  6945. /* ---, */
  6946. dec ecx /* decrease loop counter COLUMNS */
  6947. jnz L10412 /* check loop termination, proceed if required */
  6948. mov esi, ebx /* restore most left current row Src address */
  6949. mov edi, edx /* restore most left current row Dest address */
  6950. add esi, eax /* move to the next row in Src */
  6951. add edi, eax /* move to the next row in Dest */
  6952. dec rows /* decrease loop counter ROWS */
  6953. jnz L10410 /* check loop termination, proceed if required */
  6954. /* ---, */
  6955. emms /* exit MMX state */
  6956. popa
  6957. }
  6958. #else
  6959. asm volatile
  6960. ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
  6961. "mov %3, %%eax \n\t" /* load columns into EAX */
  6962. "xor %%ebx, %%ebx \n\t" /* zero EBX */
  6963. "mov %4, %%bl \n\t" /* load NRightShift into BL */
  6964. "movd %%ebx, %%mm1 \n\t" /* copy NRightShift into MM1 */
  6965. /* --- */
  6966. "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
  6967. "mov %0, %%edi \n\t" /* load Dest address to EDI */
  6968. "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
  6969. "inc %%edi \n\t" /* 1 byte offset from the left edge */
  6970. /* initialize ROWS counter (kept in the rows variable itself, EDX is used to save EDI) */
  6971. "subl $2, %2 \n\t" /* do not use first and last rows */
  6972. /* --- */
  6973. ".L10410: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
  6974. "shr $3, %%ecx \n\t" /* ECX = columns/8 (MMX loads 8 bytes at a time) */
  6975. "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
  6976. "mov %%edi, %%edx \n\t" /* save EDI in EDX */
  6977. ".align 16 \n\t" /* 16 byte alignment of the loop entry */
  6978. ".L10412: \n\t"
  6979. /* --- */
  6980. "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
  6981. "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
  6982. "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
  6983. "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
  6984. "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
  6985. "psrlw %%mm1, %%mm4 \n\t" /* shift right each pixel NshiftRight times */
  6986. "psrlw %%mm1, %%mm5 \n\t" /* shift right each pixel NshiftRight times */
  6987. "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
  6988. "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
  6989. "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
  6990. "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
  6991. "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
  6992. "psrlw %%mm1, %%mm6 \n\t" /* shift right each pixel NshiftRight times */
  6993. "psrlw %%mm1, %%mm7 \n\t" /* shift right each pixel NshiftRight times */
  6994. "add %%eax, %%esi \n\t" /* move to the next row of Src */
  6995. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  6996. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  6997. "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
  6998. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  6999. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  7000. "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  7001. "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
  7002. "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
  7003. "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
  7004. "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 (middle row is weighted twice) */
  7005. "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 (middle row is weighted twice) */
  7006. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  7007. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  7008. "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
  7009. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  7010. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  7011. "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  7012. "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
  7013. "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
  7014. "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
  7015. "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 (middle row is weighted twice) */
  7016. "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 (middle row is weighted twice) */
  7017. "add %%eax, %%esi \n\t" /* move to the next row of Src */
  7018. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  7019. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  7020. "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
  7021. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  7022. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  7023. "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  7024. "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
  7025. "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
  7026. "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
  7027. "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
  7028. "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
  7029. "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
  7030. "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
  7031. "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
  7032. "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
  7033. "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
  7034. "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
  7035. "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
  7036. /* --- */
  7037. "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
  7038. "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
  7039. "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
  7040. "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
  7041. "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
  7042. "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
  7043. "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
  7044. "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
  7045. "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
  7046. "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
  7047. "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
  7048. "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
  7049. "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
  7050. "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
  7051. /* Take abs values of MM4 and MM5 */
  7052. "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
  7053. "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
  7054. "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
  7055. "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
  7056. "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
  7057. "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
  7058. "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  7059. "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
  7060. "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
  7061. "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
  7062. /* --- */
  7063. "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
  7064. "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
  7065. "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
  7066. /* --- */
  7067. "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
  7068. "jnz .L10412 \n\t" /* check loop termination, proceed if required */
  7069. "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */
  7070. "mov %%edx, %%edi \n\t" /* restore most left current row Dest address */
  7071. "add %%eax, %%esi \n\t" /* move to the next row in Src */
  7072. "add %%eax, %%edi \n\t" /* move to the next row in Dest */
  7073. "decl %2 \n\t" /* decrease loop counter ROWS */
  7074. "jnz .L10410 \n\t" /* check loop termination, proceed if required */
  7075. /* --- */
  7076. "emms \n\t" /* exit MMX state */
  7077. "popa \n\t":"=m" (Dest) /* %0 (declared as output, but the asm only reads the pointer value) */
  7078. :"m"(Src), /* %1 */
  7079. "m"(rows), /* %2 - NOTE(review): written by subl/decl above although listed as an input operand */
  7080. "m"(columns), /* %3 */
  7081. "m"(NRightShift) /* %4 */
  7082. );
  7083. #endif
  7084. #endif
  7085. return (0);
  7086. } else {
  7087. /* No non-MMX implementation yet */
  7088. return (-1);
  7089. }
  7090. }
  7091. /*!
  7092. \brief Align stack to 32 byte boundary.
        Computes (ESP - 4) rounded down to a multiple of 32, stores the previous ESP value
        at that address and then makes that address the new ESP. The stored value is what
        SDL_imageFilterRestoreStack() loads back from [ESP].
        Compiles to an empty function when USE_MMX is not defined.
        NOTE(review): EBX is overwritten without being saved and the GCC variant declares
        no clobbers - confirm this is safe for the compilers in use.
        NOTE(review): ESP is changed inside a C function body; whether the
        compiler-generated function exit tolerates that is not visible here - verify.
  7093. */
  7094. void SDL_imageFilterAlignStack(void)
  7095. {
  7096. #ifdef USE_MMX
  7097. #if !defined(GCC__)
  7098. __asm
  7099. { /* --- stack alignment --- */
  7100. mov ebx, esp /* load ESP into EBX */
  7101. sub ebx, 4 /* reserve space on stack for old value of ESP */
  7102. and ebx, -32 /* align EBX along a 32 byte boundary */
  7103. mov [ebx], esp /* save old value of ESP in stack, behind the boundary */
  7104. mov esp, ebx /* align ESP along a 32 byte boundary */
  7105. }
  7106. #else
  7107. asm volatile
  7108. ( /* --- stack alignment --- */
  7109. "mov %%esp, %%ebx \n\t" /* load ESP into EBX */
  7110. "sub $4, %%ebx \n\t" /* reserve space on stack for old value of ESP */
  7111. "and $-32, %%ebx \n\t" /* align EBX along a 32 byte boundary */
  7112. "mov %%esp, (%%ebx) \n\t" /* save old value of ESP in stack, behind the boundary */
  7113. "mov %%ebx, %%esp \n\t" /* align ESP along a 32 byte boundary */
  7114. ::);
  7115. #endif
  7116. #endif
  7117. }
  7118. /*!
  7119. \brief Restore previously aligned stack.
        Loads the 32 bit value stored at [ESP] and makes it the new ESP. This matches the
        layout left by SDL_imageFilterAlignStack(), which stores the old ESP at the new
        stack top; the value is only meaningful if ESP still points at that slot.
        Compiles to an empty function when USE_MMX is not defined.
        NOTE(review): EBX is overwritten without being saved and the GCC variant declares
        no clobbers - confirm this is safe for the compilers in use.
  7120. */
  7121. void SDL_imageFilterRestoreStack(void)
  7122. {
  7123. #ifdef USE_MMX
  7124. #if !defined(GCC__)
  7125. __asm
  7126. { /* --- restoring old stack --- */
  7127. mov ebx, [esp] /* load old value of ESP */
  7128. mov esp, ebx /* restore old value of ESP */
  7129. }
  7130. #else
  7131. asm volatile
  7132. ( /* --- restoring old stack --- */
  7133. "mov (%%esp), %%ebx \n\t" /* load old value of ESP */
  7134. "mov %%ebx, %%esp \n\t" /* restore old value of ESP */
  7135. ::);
  7136. #endif
  7137. #endif
  7138. }