PageRenderTime 62ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/ffmpeg/libswscale/x86/rgb2rgb_template.c

https://bitbucket.org/bgiorgini/xbmc
C | 2595 lines | 2386 code | 131 blank | 78 comment | 78 complexity | 69133a8698389e86b7d9121a15e4a22d MD5 | raw file
Possible License(s): GPL-3.0, CC-BY-SA-3.0, BSD-3-Clause, GPL-2.0, LGPL-3.0, 0BSD, LGPL-2.0, AGPL-1.0, LGPL-2.1

Large files are truncated; the full file is available at the source repository linked above.

  1. /*
  2. * software RGB to RGB converter
  3. * pluralize by software PAL8 to RGB converter
  4. * software YUV to YUV converter
  5. * software YUV to RGB converter
  6. * Written by Nick Kurshev.
  7. * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
  8. * lot of big-endian byte order fixes by Alex Beregszaszi
  9. *
  10. * This file is part of FFmpeg.
  11. *
  12. * FFmpeg is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * FFmpeg is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with FFmpeg; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
/* CPU-dispatch glue: choose the concrete instruction mnemonics used by the
 * inline-asm bodies below.  COMPILE_TEMPLATE_AMD3DNOW / _MMX2 / _SSE2 are
 * expected to be defined by the file that #includes this template (this file
 * is compiled multiple times under different RENAME()/feature settings). */
  26. #include <stddef.h>
  27. #undef PREFETCH
  28. #undef MOVNTQ
  29. #undef EMMS
  30. #undef SFENCE
  31. #undef PAVGB
  32. #if COMPILE_TEMPLATE_AMD3DNOW
  33. #define PREFETCH "prefetch"
  34. #define PAVGB "pavgusb"
  35. #elif COMPILE_TEMPLATE_MMX2
  36. #define PREFETCH "prefetchnta"
  37. #define PAVGB "pavgb"
  38. #else
/* no prefetch instruction available: expand to a harmless asm comment */
  39. #define PREFETCH " # nop"
  40. #endif
  41. #if COMPILE_TEMPLATE_AMD3DNOW
  42. /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
  43. #define EMMS "femms"
  44. #else
  45. #define EMMS "emms"
  46. #endif
  47. #if COMPILE_TEMPLATE_MMX2
/* MMX2 provides non-temporal stores; they are paired with a closing SFENCE */
  48. #define MOVNTQ "movntq"
  49. #define SFENCE "sfence"
  50. #else
  51. #define MOVNTQ "movq"
  52. #define SFENCE " # nop"
  53. #endif
/* NOTE(review): the matching #endif directives for the two guards below lie
 * beyond this excerpt. */
  54. #if !COMPILE_TEMPLATE_SSE2
  55. #if !COMPILE_TEMPLATE_AMD3DNOW
/* Expand packed 24-bit pixels (src, src_size bytes) to 32-bit pixels (dst).
 * MMX loop handles 8 pixels per iteration (24 src bytes -> 32 dst bytes);
 * mm7 holds mask32a (defined elsewhere) which is OR-ed in to fill the 4th
 * byte of every pixel.  Scalar tail copies 3 bytes and stores 255 as the
 * 4th.  NOTE(review): the actual channel order produced depends on mask32a
 * and the caller's pixel layout — not visible here. */
  56. static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
  57. {
  58. uint8_t *dest = dst;
  59. const uint8_t *s = src;
  60. const uint8_t *end;
  61. const uint8_t *mm_end;
  62. end = s + src_size;
  63. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* stop the MMX loop while at least 24 readable source bytes remain */
  64. mm_end = end - 23;
  65. __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
  66. while (s < mm_end) {
  67. __asm__ volatile(
  68. PREFETCH" 32%1 \n\t"
  69. "movd %1, %%mm0 \n\t"
  70. "punpckldq 3%1, %%mm0 \n\t"
  71. "movd 6%1, %%mm1 \n\t"
  72. "punpckldq 9%1, %%mm1 \n\t"
  73. "movd 12%1, %%mm2 \n\t"
  74. "punpckldq 15%1, %%mm2 \n\t"
  75. "movd 18%1, %%mm3 \n\t"
  76. "punpckldq 21%1, %%mm3 \n\t"
  77. "por %%mm7, %%mm0 \n\t"
  78. "por %%mm7, %%mm1 \n\t"
  79. "por %%mm7, %%mm2 \n\t"
  80. "por %%mm7, %%mm3 \n\t"
  81. MOVNTQ" %%mm0, %0 \n\t"
  82. MOVNTQ" %%mm1, 8%0 \n\t"
  83. MOVNTQ" %%mm2, 16%0 \n\t"
  84. MOVNTQ" %%mm3, 24%0"
  85. :"=m"(*dest)
  86. :"m"(*s)
  87. :"memory");
  88. dest += 32;
  89. s += 24;
  90. }
  91. __asm__ volatile(SFENCE:::"memory");
  92. __asm__ volatile(EMMS:::"memory");
/* scalar tail: remaining 0..7 pixels, alpha/filler byte forced to 255 */
  93. while (s < end) {
  94. *dest++ = *s++;
  95. *dest++ = *s++;
  96. *dest++ = *s++;
  97. *dest++ = 255;
  98. }
  99. }
/* Repack four quadwords of 32-bit pixels held in mm0/mm2, mm1/mm3, mm4/mm6,
 * mm5/mm7 into three quadwords of packed 24-bit pixels and stream them to
 * %0, 8%0 and 16%0.  The mask24* constants (declared elsewhere) select the
 * surviving bytes so every 4th byte of each source pixel is dropped.
 * Expects the caller's asm to have duplicated the inputs as shown at the
 * call sites (mm2=mm0, mm3=mm1, mm6=mm4, mm7=mm5). */
  100. #define STORE_BGR24_MMX \
  101. "psrlq $8, %%mm2 \n\t" \
  102. "psrlq $8, %%mm3 \n\t" \
  103. "psrlq $8, %%mm6 \n\t" \
  104. "psrlq $8, %%mm7 \n\t" \
  105. "pand "MANGLE(mask24l)", %%mm0\n\t" \
  106. "pand "MANGLE(mask24l)", %%mm1\n\t" \
  107. "pand "MANGLE(mask24l)", %%mm4\n\t" \
  108. "pand "MANGLE(mask24l)", %%mm5\n\t" \
  109. "pand "MANGLE(mask24h)", %%mm2\n\t" \
  110. "pand "MANGLE(mask24h)", %%mm3\n\t" \
  111. "pand "MANGLE(mask24h)", %%mm6\n\t" \
  112. "pand "MANGLE(mask24h)", %%mm7\n\t" \
  113. "por %%mm2, %%mm0 \n\t" \
  114. "por %%mm3, %%mm1 \n\t" \
  115. "por %%mm6, %%mm4 \n\t" \
  116. "por %%mm7, %%mm5 \n\t" \
  117. \
  118. "movq %%mm1, %%mm2 \n\t" \
  119. "movq %%mm4, %%mm3 \n\t" \
  120. "psllq $48, %%mm2 \n\t" \
  121. "psllq $32, %%mm3 \n\t" \
  122. "pand "MANGLE(mask24hh)", %%mm2\n\t" \
  123. "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
  124. "por %%mm2, %%mm0 \n\t" \
  125. "psrlq $16, %%mm1 \n\t" \
  126. "psrlq $32, %%mm4 \n\t" \
  127. "psllq $16, %%mm5 \n\t" \
  128. "por %%mm3, %%mm1 \n\t" \
  129. "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
  130. "por %%mm5, %%mm4 \n\t" \
  131. \
  132. MOVNTQ" %%mm0, %0 \n\t" \
  133. MOVNTQ" %%mm1, 8%0 \n\t" \
  134. MOVNTQ" %%mm4, 16%0"
/* Drop the 4th byte of every 32-bit pixel, producing packed 24-bit pixels.
 * MMX loop converts 8 pixels per iteration (32 src bytes -> 24 dst bytes)
 * via STORE_BGR24_MMX; scalar tail copies 3 bytes and skips the 4th. */
  135. static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
  136. {
  137. uint8_t *dest = dst;
  138. const uint8_t *s = src;
  139. const uint8_t *end;
  140. const uint8_t *mm_end;
  141. end = s + src_size;
  142. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* stop the MMX loop while at least 32 readable source bytes remain */
  143. mm_end = end - 31;
  144. while (s < mm_end) {
  145. __asm__ volatile(
  146. PREFETCH" 32%1 \n\t"
  147. "movq %1, %%mm0 \n\t"
  148. "movq 8%1, %%mm1 \n\t"
  149. "movq 16%1, %%mm4 \n\t"
  150. "movq 24%1, %%mm5 \n\t"
  151. "movq %%mm0, %%mm2 \n\t"
  152. "movq %%mm1, %%mm3 \n\t"
  153. "movq %%mm4, %%mm6 \n\t"
  154. "movq %%mm5, %%mm7 \n\t"
  155. STORE_BGR24_MMX
  156. :"=m"(*dest)
  157. :"m"(*s)
  158. :"memory");
  159. dest += 24;
  160. s += 32;
  161. }
  162. __asm__ volatile(SFENCE:::"memory");
  163. __asm__ volatile(EMMS:::"memory");
/* scalar tail: copy 3 payload bytes per pixel, discard the 4th */
  164. while (s < end) {
  165. *dest++ = *s++;
  166. *dest++ = *s++;
  167. *dest++ = *s++;
  168. s++;
  169. }
  170. }
  171. /*
  172. original by Strepto/Astral
  173. ported to gcc & bugfixed: A'rpi
  174. MMX2, 3DNOW optimization by Nick Kurshev
  175. 32-bit C version, and and&add trick by Michael Niedermayer
  176. */
/* Convert RGB555 to RGB565 in place-compatible fashion: the G and R fields
 * are shifted up one bit by the and&add trick (x&mask gives the bits to
 * move; adding them to x shifts them left by one).  MMX loop handles 8
 * pixels (16 bytes) per iteration with mask15s in mm4; then a 32-bit C
 * loop, then a final 16-bit pixel if src_size is not a multiple of 4. */
  177. static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
  178. {
  179. register const uint8_t* s=src;
  180. register uint8_t* d=dst;
  181. register const uint8_t *end;
  182. const uint8_t *mm_end;
  183. end = s + src_size;
  184. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  185. __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
  186. mm_end = end - 15;
  187. while (s<mm_end) {
  188. __asm__ volatile(
  189. PREFETCH" 32%1 \n\t"
  190. "movq %1, %%mm0 \n\t"
  191. "movq 8%1, %%mm2 \n\t"
  192. "movq %%mm0, %%mm1 \n\t"
  193. "movq %%mm2, %%mm3 \n\t"
  194. "pand %%mm4, %%mm0 \n\t"
  195. "pand %%mm4, %%mm2 \n\t"
  196. "paddw %%mm1, %%mm0 \n\t"
  197. "paddw %%mm3, %%mm2 \n\t"
  198. MOVNTQ" %%mm0, %0 \n\t"
  199. MOVNTQ" %%mm2, 8%0"
  200. :"=m"(*d)
  201. :"m"(*s)
  202. );
  203. d+=16;
  204. s+=16;
  205. }
  206. __asm__ volatile(SFENCE:::"memory");
  207. __asm__ volatile(EMMS:::"memory");
/* 32-bit C fallback: two pixels at a time using the same and&add trick */
  208. mm_end = end - 3;
  209. while (s < mm_end) {
  210. register unsigned x= *((const uint32_t *)s);
  211. *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
  212. d+=4;
  213. s+=4;
  214. }
/* at most one 16-bit pixel can remain */
  215. if (s < end) {
  216. register unsigned short x= *((const uint16_t *)s);
  217. *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
  218. }
  219. }
/* Convert RGB565 to RGB555: R and G fields move down one bit ((x>>1) under
 * mask15rg), the 5-bit B field (mask15b) is kept in place; the G LSB is
 * discarded.  MMX loop handles 8 pixels (16 bytes) per iteration, then a
 * 32-bit C loop, then at most one trailing 16-bit pixel. */
  220. static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
  221. {
  222. register const uint8_t* s=src;
  223. register uint8_t* d=dst;
  224. register const uint8_t *end;
  225. const uint8_t *mm_end;
  226. end = s + src_size;
  227. __asm__ volatile(PREFETCH" %0"::"m"(*s));
  228. __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
  229. __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
  230. mm_end = end - 15;
  231. while (s<mm_end) {
  232. __asm__ volatile(
  233. PREFETCH" 32%1 \n\t"
  234. "movq %1, %%mm0 \n\t"
  235. "movq 8%1, %%mm2 \n\t"
  236. "movq %%mm0, %%mm1 \n\t"
  237. "movq %%mm2, %%mm3 \n\t"
  238. "psrlq $1, %%mm0 \n\t"
  239. "psrlq $1, %%mm2 \n\t"
  240. "pand %%mm7, %%mm0 \n\t"
  241. "pand %%mm7, %%mm2 \n\t"
  242. "pand %%mm6, %%mm1 \n\t"
  243. "pand %%mm6, %%mm3 \n\t"
  244. "por %%mm1, %%mm0 \n\t"
  245. "por %%mm3, %%mm2 \n\t"
  246. MOVNTQ" %%mm0, %0 \n\t"
  247. MOVNTQ" %%mm2, 8%0"
  248. :"=m"(*d)
  249. :"m"(*s)
  250. );
  251. d+=16;
  252. s+=16;
  253. }
  254. __asm__ volatile(SFENCE:::"memory");
  255. __asm__ volatile(EMMS:::"memory");
/* 32-bit C fallback: two pixels at a time */
  256. mm_end = end - 3;
  257. while (s < mm_end) {
  258. register uint32_t x= *((const uint32_t*)s);
  259. *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
  260. s+=4;
  261. d+=4;
  262. }
/* at most one 16-bit pixel can remain */
  263. if (s < end) {
  264. register uint16_t x= *((const uint16_t*)s);
  265. *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
  266. }
  267. }
/* Convert 32-bit pixels to 16-bit 565.  The enabled (#if 1) path uses a
 * pmaddwd-based multiply trick with constants mask3216g/mask3216br/mul3216
 * (declared elsewhere) to assemble the 5-6-5 fields, processing 4 pixels
 * per iteration inside a single asm loop.  The disabled #else path is the
 * older shift-and-mask variant, kept for reference.  Scalar tail converts
 * one pixel at a time. */
  268. static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
  269. {
  270. const uint8_t *s = src;
  271. const uint8_t *end;
  272. const uint8_t *mm_end;
  273. uint16_t *d = (uint16_t *)dst;
  274. end = s + src_size;
  275. mm_end = end - 15;
  276. #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
  277. __asm__ volatile(
  278. "movq %3, %%mm5 \n\t"
  279. "movq %4, %%mm6 \n\t"
  280. "movq %5, %%mm7 \n\t"
  281. "jmp 2f \n\t"
  282. ".p2align 4 \n\t"
  283. "1: \n\t"
  284. PREFETCH" 32(%1) \n\t"
  285. "movd (%1), %%mm0 \n\t"
  286. "movd 4(%1), %%mm3 \n\t"
  287. "punpckldq 8(%1), %%mm0 \n\t"
  288. "punpckldq 12(%1), %%mm3 \n\t"
  289. "movq %%mm0, %%mm1 \n\t"
  290. "movq %%mm3, %%mm4 \n\t"
  291. "pand %%mm6, %%mm0 \n\t"
  292. "pand %%mm6, %%mm3 \n\t"
  293. "pmaddwd %%mm7, %%mm0 \n\t"
  294. "pmaddwd %%mm7, %%mm3 \n\t"
  295. "pand %%mm5, %%mm1 \n\t"
  296. "pand %%mm5, %%mm4 \n\t"
  297. "por %%mm1, %%mm0 \n\t"
  298. "por %%mm4, %%mm3 \n\t"
  299. "psrld $5, %%mm0 \n\t"
  300. "pslld $11, %%mm3 \n\t"
  301. "por %%mm3, %%mm0 \n\t"
  302. MOVNTQ" %%mm0, (%0) \n\t"
  303. "add $16, %1 \n\t"
  304. "add $8, %0 \n\t"
  305. "2: \n\t"
  306. "cmp %2, %1 \n\t"
  307. " jb 1b \n\t"
  308. : "+r" (d), "+r"(s)
  309. : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
  310. );
  311. #else
/* reference implementation: per-field shift and mask, 4 pixels/iteration */
  312. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  313. __asm__ volatile(
  314. "movq %0, %%mm7 \n\t"
  315. "movq %1, %%mm6 \n\t"
  316. ::"m"(red_16mask),"m"(green_16mask));
  317. while (s < mm_end) {
  318. __asm__ volatile(
  319. PREFETCH" 32%1 \n\t"
  320. "movd %1, %%mm0 \n\t"
  321. "movd 4%1, %%mm3 \n\t"
  322. "punpckldq 8%1, %%mm0 \n\t"
  323. "punpckldq 12%1, %%mm3 \n\t"
  324. "movq %%mm0, %%mm1 \n\t"
  325. "movq %%mm0, %%mm2 \n\t"
  326. "movq %%mm3, %%mm4 \n\t"
  327. "movq %%mm3, %%mm5 \n\t"
  328. "psrlq $3, %%mm0 \n\t"
  329. "psrlq $3, %%mm3 \n\t"
  330. "pand %2, %%mm0 \n\t"
  331. "pand %2, %%mm3 \n\t"
  332. "psrlq $5, %%mm1 \n\t"
  333. "psrlq $5, %%mm4 \n\t"
  334. "pand %%mm6, %%mm1 \n\t"
  335. "pand %%mm6, %%mm4 \n\t"
  336. "psrlq $8, %%mm2 \n\t"
  337. "psrlq $8, %%mm5 \n\t"
  338. "pand %%mm7, %%mm2 \n\t"
  339. "pand %%mm7, %%mm5 \n\t"
  340. "por %%mm1, %%mm0 \n\t"
  341. "por %%mm4, %%mm3 \n\t"
  342. "por %%mm2, %%mm0 \n\t"
  343. "por %%mm5, %%mm3 \n\t"
  344. "psllq $16, %%mm3 \n\t"
  345. "por %%mm3, %%mm0 \n\t"
  346. MOVNTQ" %%mm0, %0 \n\t"
  347. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  348. d += 4;
  349. s += 16;
  350. }
  351. #endif
  352. __asm__ volatile(SFENCE:::"memory");
  353. __asm__ volatile(EMMS:::"memory");
/* scalar tail: pack one 32-bit pixel into 5-6-5 */
  354. while (s < end) {
  355. register int rgb = *(const uint32_t*)s; s += 4;
  356. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
  357. }
  358. }
/* Convert 32-bit pixels to 16-bit 565 with the R/B channels swapped
 * relative to rgb32to16 (see the scalar tail: low byte shifted up to the
 * red field, high byte down to blue).  MMX loop handles 4 pixels per
 * iteration using red_16mask/green_16mask/blue_16mask (declared
 * elsewhere). */
  359. static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
  360. {
  361. const uint8_t *s = src;
  362. const uint8_t *end;
  363. const uint8_t *mm_end;
  364. uint16_t *d = (uint16_t *)dst;
  365. end = s + src_size;
  366. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  367. __asm__ volatile(
  368. "movq %0, %%mm7 \n\t"
  369. "movq %1, %%mm6 \n\t"
  370. ::"m"(red_16mask),"m"(green_16mask));
  371. mm_end = end - 15;
  372. while (s < mm_end) {
  373. __asm__ volatile(
  374. PREFETCH" 32%1 \n\t"
  375. "movd %1, %%mm0 \n\t"
  376. "movd 4%1, %%mm3 \n\t"
  377. "punpckldq 8%1, %%mm0 \n\t"
  378. "punpckldq 12%1, %%mm3 \n\t"
  379. "movq %%mm0, %%mm1 \n\t"
  380. "movq %%mm0, %%mm2 \n\t"
  381. "movq %%mm3, %%mm4 \n\t"
  382. "movq %%mm3, %%mm5 \n\t"
  383. "psllq $8, %%mm0 \n\t"
  384. "psllq $8, %%mm3 \n\t"
  385. "pand %%mm7, %%mm0 \n\t"
  386. "pand %%mm7, %%mm3 \n\t"
  387. "psrlq $5, %%mm1 \n\t"
  388. "psrlq $5, %%mm4 \n\t"
  389. "pand %%mm6, %%mm1 \n\t"
  390. "pand %%mm6, %%mm4 \n\t"
  391. "psrlq $19, %%mm2 \n\t"
  392. "psrlq $19, %%mm5 \n\t"
  393. "pand %2, %%mm2 \n\t"
  394. "pand %2, %%mm5 \n\t"
  395. "por %%mm1, %%mm0 \n\t"
  396. "por %%mm4, %%mm3 \n\t"
  397. "por %%mm2, %%mm0 \n\t"
  398. "por %%mm5, %%mm3 \n\t"
  399. "psllq $16, %%mm3 \n\t"
  400. "por %%mm3, %%mm0 \n\t"
  401. MOVNTQ" %%mm0, %0 \n\t"
  402. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  403. d += 4;
  404. s += 16;
  405. }
  406. __asm__ volatile(SFENCE:::"memory");
  407. __asm__ volatile(EMMS:::"memory");
/* scalar tail: channel-swapped 5-6-5 pack */
  408. while (s < end) {
  409. register int rgb = *(const uint32_t*)s; s += 4;
  410. *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
  411. }
  412. }
/* Convert 32-bit pixels to 16-bit 555.  Structure mirrors rgb32to16: the
 * enabled (#if 1) path uses the pmaddwd multiply trick with
 * mask3215g/mask3216br/mul3215 (declared elsewhere) and different final
 * shifts (>>6 / <<10) for the 5-5-5 layout; the disabled #else path is the
 * shift-and-mask reference.  Scalar tail packs one pixel at a time. */
  413. static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
  414. {
  415. const uint8_t *s = src;
  416. const uint8_t *end;
  417. const uint8_t *mm_end;
  418. uint16_t *d = (uint16_t *)dst;
  419. end = s + src_size;
  420. mm_end = end - 15;
  421. #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
  422. __asm__ volatile(
  423. "movq %3, %%mm5 \n\t"
  424. "movq %4, %%mm6 \n\t"
  425. "movq %5, %%mm7 \n\t"
  426. "jmp 2f \n\t"
  427. ".p2align 4 \n\t"
  428. "1: \n\t"
  429. PREFETCH" 32(%1) \n\t"
  430. "movd (%1), %%mm0 \n\t"
  431. "movd 4(%1), %%mm3 \n\t"
  432. "punpckldq 8(%1), %%mm0 \n\t"
  433. "punpckldq 12(%1), %%mm3 \n\t"
  434. "movq %%mm0, %%mm1 \n\t"
  435. "movq %%mm3, %%mm4 \n\t"
  436. "pand %%mm6, %%mm0 \n\t"
  437. "pand %%mm6, %%mm3 \n\t"
  438. "pmaddwd %%mm7, %%mm0 \n\t"
  439. "pmaddwd %%mm7, %%mm3 \n\t"
  440. "pand %%mm5, %%mm1 \n\t"
  441. "pand %%mm5, %%mm4 \n\t"
  442. "por %%mm1, %%mm0 \n\t"
  443. "por %%mm4, %%mm3 \n\t"
  444. "psrld $6, %%mm0 \n\t"
  445. "pslld $10, %%mm3 \n\t"
  446. "por %%mm3, %%mm0 \n\t"
  447. MOVNTQ" %%mm0, (%0) \n\t"
  448. "add $16, %1 \n\t"
  449. "add $8, %0 \n\t"
  450. "2: \n\t"
  451. "cmp %2, %1 \n\t"
  452. " jb 1b \n\t"
  453. : "+r" (d), "+r"(s)
  454. : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
  455. );
  456. #else
/* reference implementation: per-field shift and mask, 4 pixels/iteration */
  457. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  458. __asm__ volatile(
  459. "movq %0, %%mm7 \n\t"
  460. "movq %1, %%mm6 \n\t"
  461. ::"m"(red_15mask),"m"(green_15mask));
  462. while (s < mm_end) {
  463. __asm__ volatile(
  464. PREFETCH" 32%1 \n\t"
  465. "movd %1, %%mm0 \n\t"
  466. "movd 4%1, %%mm3 \n\t"
  467. "punpckldq 8%1, %%mm0 \n\t"
  468. "punpckldq 12%1, %%mm3 \n\t"
  469. "movq %%mm0, %%mm1 \n\t"
  470. "movq %%mm0, %%mm2 \n\t"
  471. "movq %%mm3, %%mm4 \n\t"
  472. "movq %%mm3, %%mm5 \n\t"
  473. "psrlq $3, %%mm0 \n\t"
  474. "psrlq $3, %%mm3 \n\t"
  475. "pand %2, %%mm0 \n\t"
  476. "pand %2, %%mm3 \n\t"
  477. "psrlq $6, %%mm1 \n\t"
  478. "psrlq $6, %%mm4 \n\t"
  479. "pand %%mm6, %%mm1 \n\t"
  480. "pand %%mm6, %%mm4 \n\t"
  481. "psrlq $9, %%mm2 \n\t"
  482. "psrlq $9, %%mm5 \n\t"
  483. "pand %%mm7, %%mm2 \n\t"
  484. "pand %%mm7, %%mm5 \n\t"
  485. "por %%mm1, %%mm0 \n\t"
  486. "por %%mm4, %%mm3 \n\t"
  487. "por %%mm2, %%mm0 \n\t"
  488. "por %%mm5, %%mm3 \n\t"
  489. "psllq $16, %%mm3 \n\t"
  490. "por %%mm3, %%mm0 \n\t"
  491. MOVNTQ" %%mm0, %0 \n\t"
  492. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  493. d += 4;
  494. s += 16;
  495. }
  496. #endif
  497. __asm__ volatile(SFENCE:::"memory");
  498. __asm__ volatile(EMMS:::"memory");
/* scalar tail: pack one 32-bit pixel into 5-5-5 */
  499. while (s < end) {
  500. register int rgb = *(const uint32_t*)s; s += 4;
  501. *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
  502. }
  503. }
/* Convert 32-bit pixels to 16-bit 555 with R/B swapped relative to
 * rgb32to15 (scalar tail: low byte shifted up to red, high byte down to
 * blue).  MMX loop handles 4 pixels per iteration using
 * red_15mask/green_15mask/blue_15mask (declared elsewhere). */
  504. static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
  505. {
  506. const uint8_t *s = src;
  507. const uint8_t *end;
  508. const uint8_t *mm_end;
  509. uint16_t *d = (uint16_t *)dst;
  510. end = s + src_size;
  511. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  512. __asm__ volatile(
  513. "movq %0, %%mm7 \n\t"
  514. "movq %1, %%mm6 \n\t"
  515. ::"m"(red_15mask),"m"(green_15mask));
  516. mm_end = end - 15;
  517. while (s < mm_end) {
  518. __asm__ volatile(
  519. PREFETCH" 32%1 \n\t"
  520. "movd %1, %%mm0 \n\t"
  521. "movd 4%1, %%mm3 \n\t"
  522. "punpckldq 8%1, %%mm0 \n\t"
  523. "punpckldq 12%1, %%mm3 \n\t"
  524. "movq %%mm0, %%mm1 \n\t"
  525. "movq %%mm0, %%mm2 \n\t"
  526. "movq %%mm3, %%mm4 \n\t"
  527. "movq %%mm3, %%mm5 \n\t"
  528. "psllq $7, %%mm0 \n\t"
  529. "psllq $7, %%mm3 \n\t"
  530. "pand %%mm7, %%mm0 \n\t"
  531. "pand %%mm7, %%mm3 \n\t"
  532. "psrlq $6, %%mm1 \n\t"
  533. "psrlq $6, %%mm4 \n\t"
  534. "pand %%mm6, %%mm1 \n\t"
  535. "pand %%mm6, %%mm4 \n\t"
  536. "psrlq $19, %%mm2 \n\t"
  537. "psrlq $19, %%mm5 \n\t"
  538. "pand %2, %%mm2 \n\t"
  539. "pand %2, %%mm5 \n\t"
  540. "por %%mm1, %%mm0 \n\t"
  541. "por %%mm4, %%mm3 \n\t"
  542. "por %%mm2, %%mm0 \n\t"
  543. "por %%mm5, %%mm3 \n\t"
  544. "psllq $16, %%mm3 \n\t"
  545. "por %%mm3, %%mm0 \n\t"
  546. MOVNTQ" %%mm0, %0 \n\t"
  547. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  548. d += 4;
  549. s += 16;
  550. }
  551. __asm__ volatile(SFENCE:::"memory");
  552. __asm__ volatile(EMMS:::"memory");
/* scalar tail: channel-swapped 5-5-5 pack */
  553. while (s < end) {
  554. register int rgb = *(const uint32_t*)s; s += 4;
  555. *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
  556. }
  557. }
/* Convert packed 24-bit pixels to 16-bit 565.  MMX loop converts 4 pixels
 * per iteration (12 src bytes -> 8 dst bytes) via shift-and-mask with the
 * *_16mask constants (declared elsewhere); scalar tail reads bytes in
 * b,g,r order. */
  558. static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
  559. {
  560. const uint8_t *s = src;
  561. const uint8_t *end;
  562. const uint8_t *mm_end;
  563. uint16_t *d = (uint16_t *)dst;
  564. end = s + src_size;
  565. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  566. __asm__ volatile(
  567. "movq %0, %%mm7 \n\t"
  568. "movq %1, %%mm6 \n\t"
  569. ::"m"(red_16mask),"m"(green_16mask));
/* stop the MMX loop while at least 12 readable source bytes remain */
  570. mm_end = end - 11;
  571. while (s < mm_end) {
  572. __asm__ volatile(
  573. PREFETCH" 32%1 \n\t"
  574. "movd %1, %%mm0 \n\t"
  575. "movd 3%1, %%mm3 \n\t"
  576. "punpckldq 6%1, %%mm0 \n\t"
  577. "punpckldq 9%1, %%mm3 \n\t"
  578. "movq %%mm0, %%mm1 \n\t"
  579. "movq %%mm0, %%mm2 \n\t"
  580. "movq %%mm3, %%mm4 \n\t"
  581. "movq %%mm3, %%mm5 \n\t"
  582. "psrlq $3, %%mm0 \n\t"
  583. "psrlq $3, %%mm3 \n\t"
  584. "pand %2, %%mm0 \n\t"
  585. "pand %2, %%mm3 \n\t"
  586. "psrlq $5, %%mm1 \n\t"
  587. "psrlq $5, %%mm4 \n\t"
  588. "pand %%mm6, %%mm1 \n\t"
  589. "pand %%mm6, %%mm4 \n\t"
  590. "psrlq $8, %%mm2 \n\t"
  591. "psrlq $8, %%mm5 \n\t"
  592. "pand %%mm7, %%mm2 \n\t"
  593. "pand %%mm7, %%mm5 \n\t"
  594. "por %%mm1, %%mm0 \n\t"
  595. "por %%mm4, %%mm3 \n\t"
  596. "por %%mm2, %%mm0 \n\t"
  597. "por %%mm5, %%mm3 \n\t"
  598. "psllq $16, %%mm3 \n\t"
  599. "por %%mm3, %%mm0 \n\t"
  600. MOVNTQ" %%mm0, %0 \n\t"
  601. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  602. d += 4;
  603. s += 12;
  604. }
  605. __asm__ volatile(SFENCE:::"memory");
  606. __asm__ volatile(EMMS:::"memory");
/* scalar tail: bytes arrive in b,g,r order */
  607. while (s < end) {
  608. const int b = *s++;
  609. const int g = *s++;
  610. const int r = *s++;
  611. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  612. }
  613. }
/* Convert packed 24-bit pixels to 16-bit 565, opposite byte order to
 * rgb24tobgr16: the MMX loop shifts left ($8) and right ($19) instead of
 * right ($3)/right ($8), and the scalar tail reads bytes in r,g,b order.
 * 4 pixels per MMX iteration (12 src bytes -> 8 dst bytes). */
  614. static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
  615. {
  616. const uint8_t *s = src;
  617. const uint8_t *end;
  618. const uint8_t *mm_end;
  619. uint16_t *d = (uint16_t *)dst;
  620. end = s + src_size;
  621. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  622. __asm__ volatile(
  623. "movq %0, %%mm7 \n\t"
  624. "movq %1, %%mm6 \n\t"
  625. ::"m"(red_16mask),"m"(green_16mask));
/* NOTE(review): sibling rgb24tobgr16 uses end - 11 here; end - 15 is a
 * stricter (safe) bound that leaves more work to the scalar tail. */
  626. mm_end = end - 15;
  627. while (s < mm_end) {
  628. __asm__ volatile(
  629. PREFETCH" 32%1 \n\t"
  630. "movd %1, %%mm0 \n\t"
  631. "movd 3%1, %%mm3 \n\t"
  632. "punpckldq 6%1, %%mm0 \n\t"
  633. "punpckldq 9%1, %%mm3 \n\t"
  634. "movq %%mm0, %%mm1 \n\t"
  635. "movq %%mm0, %%mm2 \n\t"
  636. "movq %%mm3, %%mm4 \n\t"
  637. "movq %%mm3, %%mm5 \n\t"
  638. "psllq $8, %%mm0 \n\t"
  639. "psllq $8, %%mm3 \n\t"
  640. "pand %%mm7, %%mm0 \n\t"
  641. "pand %%mm7, %%mm3 \n\t"
  642. "psrlq $5, %%mm1 \n\t"
  643. "psrlq $5, %%mm4 \n\t"
  644. "pand %%mm6, %%mm1 \n\t"
  645. "pand %%mm6, %%mm4 \n\t"
  646. "psrlq $19, %%mm2 \n\t"
  647. "psrlq $19, %%mm5 \n\t"
  648. "pand %2, %%mm2 \n\t"
  649. "pand %2, %%mm5 \n\t"
  650. "por %%mm1, %%mm0 \n\t"
  651. "por %%mm4, %%mm3 \n\t"
  652. "por %%mm2, %%mm0 \n\t"
  653. "por %%mm5, %%mm3 \n\t"
  654. "psllq $16, %%mm3 \n\t"
  655. "por %%mm3, %%mm0 \n\t"
  656. MOVNTQ" %%mm0, %0 \n\t"
  657. :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
  658. d += 4;
  659. s += 12;
  660. }
  661. __asm__ volatile(SFENCE:::"memory");
  662. __asm__ volatile(EMMS:::"memory");
/* scalar tail: bytes arrive in r,g,b order */
  663. while (s < end) {
  664. const int r = *s++;
  665. const int g = *s++;
  666. const int b = *s++;
  667. *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
  668. }
  669. }
/* Convert packed 24-bit pixels to 16-bit 555; same structure as
 * rgb24tobgr16 but with the *_15mask constants and 5-5-5 field shifts
 * ($6/$9 instead of $5/$8).  4 pixels per MMX iteration; scalar tail
 * reads bytes in b,g,r order. */
  670. static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
  671. {
  672. const uint8_t *s = src;
  673. const uint8_t *end;
  674. const uint8_t *mm_end;
  675. uint16_t *d = (uint16_t *)dst;
  676. end = s + src_size;
  677. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  678. __asm__ volatile(
  679. "movq %0, %%mm7 \n\t"
  680. "movq %1, %%mm6 \n\t"
  681. ::"m"(red_15mask),"m"(green_15mask));
  682. mm_end = end - 11;
  683. while (s < mm_end) {
  684. __asm__ volatile(
  685. PREFETCH" 32%1 \n\t"
  686. "movd %1, %%mm0 \n\t"
  687. "movd 3%1, %%mm3 \n\t"
  688. "punpckldq 6%1, %%mm0 \n\t"
  689. "punpckldq 9%1, %%mm3 \n\t"
  690. "movq %%mm0, %%mm1 \n\t"
  691. "movq %%mm0, %%mm2 \n\t"
  692. "movq %%mm3, %%mm4 \n\t"
  693. "movq %%mm3, %%mm5 \n\t"
  694. "psrlq $3, %%mm0 \n\t"
  695. "psrlq $3, %%mm3 \n\t"
  696. "pand %2, %%mm0 \n\t"
  697. "pand %2, %%mm3 \n\t"
  698. "psrlq $6, %%mm1 \n\t"
  699. "psrlq $6, %%mm4 \n\t"
  700. "pand %%mm6, %%mm1 \n\t"
  701. "pand %%mm6, %%mm4 \n\t"
  702. "psrlq $9, %%mm2 \n\t"
  703. "psrlq $9, %%mm5 \n\t"
  704. "pand %%mm7, %%mm2 \n\t"
  705. "pand %%mm7, %%mm5 \n\t"
  706. "por %%mm1, %%mm0 \n\t"
  707. "por %%mm4, %%mm3 \n\t"
  708. "por %%mm2, %%mm0 \n\t"
  709. "por %%mm5, %%mm3 \n\t"
  710. "psllq $16, %%mm3 \n\t"
  711. "por %%mm3, %%mm0 \n\t"
  712. MOVNTQ" %%mm0, %0 \n\t"
  713. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  714. d += 4;
  715. s += 12;
  716. }
  717. __asm__ volatile(SFENCE:::"memory");
  718. __asm__ volatile(EMMS:::"memory");
/* scalar tail: bytes arrive in b,g,r order */
  719. while (s < end) {
  720. const int b = *s++;
  721. const int g = *s++;
  722. const int r = *s++;
  723. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  724. }
  725. }
/* Convert packed 24-bit pixels to 16-bit 555, opposite byte order to
 * rgb24tobgr15 (left shift $7 / right shift $19; scalar tail reads r,g,b).
 * 4 pixels per MMX iteration (12 src bytes -> 8 dst bytes). */
  726. static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
  727. {
  728. const uint8_t *s = src;
  729. const uint8_t *end;
  730. const uint8_t *mm_end;
  731. uint16_t *d = (uint16_t *)dst;
  732. end = s + src_size;
  733. __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
  734. __asm__ volatile(
  735. "movq %0, %%mm7 \n\t"
  736. "movq %1, %%mm6 \n\t"
  737. ::"m"(red_15mask),"m"(green_15mask));
/* NOTE(review): sibling rgb24tobgr15 uses end - 11 here; end - 15 is a
 * stricter (safe) bound that leaves more work to the scalar tail. */
  738. mm_end = end - 15;
  739. while (s < mm_end) {
  740. __asm__ volatile(
  741. PREFETCH" 32%1 \n\t"
  742. "movd %1, %%mm0 \n\t"
  743. "movd 3%1, %%mm3 \n\t"
  744. "punpckldq 6%1, %%mm0 \n\t"
  745. "punpckldq 9%1, %%mm3 \n\t"
  746. "movq %%mm0, %%mm1 \n\t"
  747. "movq %%mm0, %%mm2 \n\t"
  748. "movq %%mm3, %%mm4 \n\t"
  749. "movq %%mm3, %%mm5 \n\t"
  750. "psllq $7, %%mm0 \n\t"
  751. "psllq $7, %%mm3 \n\t"
  752. "pand %%mm7, %%mm0 \n\t"
  753. "pand %%mm7, %%mm3 \n\t"
  754. "psrlq $6, %%mm1 \n\t"
  755. "psrlq $6, %%mm4 \n\t"
  756. "pand %%mm6, %%mm1 \n\t"
  757. "pand %%mm6, %%mm4 \n\t"
  758. "psrlq $19, %%mm2 \n\t"
  759. "psrlq $19, %%mm5 \n\t"
  760. "pand %2, %%mm2 \n\t"
  761. "pand %2, %%mm5 \n\t"
  762. "por %%mm1, %%mm0 \n\t"
  763. "por %%mm4, %%mm3 \n\t"
  764. "por %%mm2, %%mm0 \n\t"
  765. "por %%mm5, %%mm3 \n\t"
  766. "psllq $16, %%mm3 \n\t"
  767. "por %%mm3, %%mm0 \n\t"
  768. MOVNTQ" %%mm0, %0 \n\t"
  769. :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
  770. d += 4;
  771. s += 12;
  772. }
  773. __asm__ volatile(SFENCE:::"memory");
  774. __asm__ volatile(EMMS:::"memory");
/* scalar tail: bytes arrive in r,g,b order */
  775. while (s < end) {
  776. const int r = *s++;
  777. const int g = *s++;
  778. const int b = *s++;
  779. *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
  780. }
  781. }
/* Expand 16-bit 555 pixels to packed 24-bit pixels.  The MMX loop handles
 * 8 pixels per iteration in two 4-pixel halves: each half isolates the
 * three 5-bit fields with mask15b/g/r, scales them to 8 bits with pmulhw
 * against mul15_mid/mul15_hi (all declared elsewhere), interleaves them
 * into 32-bit form, then the second asm statement repacks to 24-bit via
 * STORE_BGR24_MMX.  The scalar tail replicates each field's top bits into
 * its low bits for a full 0..255 range. */
  782. static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
  783. {
  784. const uint16_t *end;
  785. const uint16_t *mm_end;
  786. uint8_t *d = dst;
  787. const uint16_t *s = (const uint16_t*)src;
  788. end = s + src_size/2;
  789. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  790. mm_end = end - 7;
  791. while (s < mm_end) {
  792. __asm__ volatile(
  793. PREFETCH" 32%1 \n\t"
  794. "movq %1, %%mm0 \n\t"
  795. "movq %1, %%mm1 \n\t"
  796. "movq %1, %%mm2 \n\t"
  797. "pand %2, %%mm0 \n\t"
  798. "pand %3, %%mm1 \n\t"
  799. "pand %4, %%mm2 \n\t"
  800. "psllq $5, %%mm0 \n\t"
  801. "pmulhw %6, %%mm0 \n\t"
  802. "pmulhw %6, %%mm1 \n\t"
  803. "pmulhw %7, %%mm2 \n\t"
  804. "movq %%mm0, %%mm3 \n\t"
  805. "movq %%mm1, %%mm4 \n\t"
  806. "movq %%mm2, %%mm5 \n\t"
  807. "punpcklwd %5, %%mm0 \n\t"
  808. "punpcklwd %5, %%mm1 \n\t"
  809. "punpcklwd %5, %%mm2 \n\t"
  810. "punpckhwd %5, %%mm3 \n\t"
  811. "punpckhwd %5, %%mm4 \n\t"
  812. "punpckhwd %5, %%mm5 \n\t"
  813. "psllq $8, %%mm1 \n\t"
  814. "psllq $16, %%mm2 \n\t"
  815. "por %%mm1, %%mm0 \n\t"
  816. "por %%mm2, %%mm0 \n\t"
  817. "psllq $8, %%mm4 \n\t"
  818. "psllq $16, %%mm5 \n\t"
  819. "por %%mm4, %%mm3 \n\t"
  820. "por %%mm5, %%mm3 \n\t"
  821. "movq %%mm0, %%mm6 \n\t"
  822. "movq %%mm3, %%mm7 \n\t"
  823. "movq 8%1, %%mm0 \n\t"
  824. "movq 8%1, %%mm1 \n\t"
  825. "movq 8%1, %%mm2 \n\t"
  826. "pand %2, %%mm0 \n\t"
  827. "pand %3, %%mm1 \n\t"
  828. "pand %4, %%mm2 \n\t"
  829. "psllq $5, %%mm0 \n\t"
  830. "pmulhw %6, %%mm0 \n\t"
  831. "pmulhw %6, %%mm1 \n\t"
  832. "pmulhw %7, %%mm2 \n\t"
  833. "movq %%mm0, %%mm3 \n\t"
  834. "movq %%mm1, %%mm4 \n\t"
  835. "movq %%mm2, %%mm5 \n\t"
  836. "punpcklwd %5, %%mm0 \n\t"
  837. "punpcklwd %5, %%mm1 \n\t"
  838. "punpcklwd %5, %%mm2 \n\t"
  839. "punpckhwd %5, %%mm3 \n\t"
  840. "punpckhwd %5, %%mm4 \n\t"
  841. "punpckhwd %5, %%mm5 \n\t"
  842. "psllq $8, %%mm1 \n\t"
  843. "psllq $16, %%mm2 \n\t"
  844. "por %%mm1, %%mm0 \n\t"
  845. "por %%mm2, %%mm0 \n\t"
  846. "psllq $8, %%mm4 \n\t"
  847. "psllq $16, %%mm5 \n\t"
  848. "por %%mm4, %%mm3 \n\t"
  849. "por %%mm5, %%mm3 \n\t"
  850. :"=m"(*d)
  851. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mmx_null),"m"(mul15_mid),"m"(mul15_hi)
  852. :"memory");
  853. /* borrowed 32 to 24 */
  854. __asm__ volatile(
  855. "movq %%mm0, %%mm4 \n\t"
  856. "movq %%mm3, %%mm5 \n\t"
  857. "movq %%mm6, %%mm0 \n\t"
  858. "movq %%mm7, %%mm1 \n\t"
  859. "movq %%mm4, %%mm6 \n\t"
  860. "movq %%mm5, %%mm7 \n\t"
  861. "movq %%mm0, %%mm2 \n\t"
  862. "movq %%mm1, %%mm3 \n\t"
  863. STORE_BGR24_MMX
  864. :"=m"(*d)
  865. :"m"(*s)
  866. :"memory");
  867. d += 24;
  868. s += 8;
  869. }
  870. __asm__ volatile(SFENCE:::"memory");
  871. __asm__ volatile(EMMS:::"memory");
/* scalar tail: expand 5-bit fields to 8 bits, duplicating the top bits */
  872. while (s < end) {
  873. register uint16_t bgr;
  874. bgr = *s++;
  875. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  876. *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
  877. *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
  878. }
  879. }
/* Expand 16-bit 565 pixels to packed 24-bit pixels.  Same two-stage
 * structure as rgb15tobgr24, but uses the mask16* constants, pre-shifts
 * the red field right by 1 (psrlq $1) and scales the 6-bit green field
 * with mul16_mid instead of mul15_mid.  The scalar tail expands 5/6/5-bit
 * fields to 8 bits by duplicating their top bits. */
  880. static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
  881. {
  882. const uint16_t *end;
  883. const uint16_t *mm_end;
  884. uint8_t *d = (uint8_t *)dst;
  885. const uint16_t *s = (const uint16_t *)src;
  886. end = s + src_size/2;
  887. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  888. mm_end = end - 7;
  889. while (s < mm_end) {
  890. __asm__ volatile(
  891. PREFETCH" 32%1 \n\t"
  892. "movq %1, %%mm0 \n\t"
  893. "movq %1, %%mm1 \n\t"
  894. "movq %1, %%mm2 \n\t"
  895. "pand %2, %%mm0 \n\t"
  896. "pand %3, %%mm1 \n\t"
  897. "pand %4, %%mm2 \n\t"
  898. "psllq $5, %%mm0 \n\t"
  899. "psrlq $1, %%mm2 \n\t"
  900. "pmulhw %6, %%mm0 \n\t"
  901. "pmulhw %8, %%mm1 \n\t"
  902. "pmulhw %7, %%mm2 \n\t"
  903. "movq %%mm0, %%mm3 \n\t"
  904. "movq %%mm1, %%mm4 \n\t"
  905. "movq %%mm2, %%mm5 \n\t"
  906. "punpcklwd %5, %%mm0 \n\t"
  907. "punpcklwd %5, %%mm1 \n\t"
  908. "punpcklwd %5, %%mm2 \n\t"
  909. "punpckhwd %5, %%mm3 \n\t"
  910. "punpckhwd %5, %%mm4 \n\t"
  911. "punpckhwd %5, %%mm5 \n\t"
  912. "psllq $8, %%mm1 \n\t"
  913. "psllq $16, %%mm2 \n\t"
  914. "por %%mm1, %%mm0 \n\t"
  915. "por %%mm2, %%mm0 \n\t"
  916. "psllq $8, %%mm4 \n\t"
  917. "psllq $16, %%mm5 \n\t"
  918. "por %%mm4, %%mm3 \n\t"
  919. "por %%mm5, %%mm3 \n\t"
  920. "movq %%mm0, %%mm6 \n\t"
  921. "movq %%mm3, %%mm7 \n\t"
  922. "movq 8%1, %%mm0 \n\t"
  923. "movq 8%1, %%mm1 \n\t"
  924. "movq 8%1, %%mm2 \n\t"
  925. "pand %2, %%mm0 \n\t"
  926. "pand %3, %%mm1 \n\t"
  927. "pand %4, %%mm2 \n\t"
  928. "psllq $5, %%mm0 \n\t"
  929. "psrlq $1, %%mm2 \n\t"
  930. "pmulhw %6, %%mm0 \n\t"
  931. "pmulhw %8, %%mm1 \n\t"
  932. "pmulhw %7, %%mm2 \n\t"
  933. "movq %%mm0, %%mm3 \n\t"
  934. "movq %%mm1, %%mm4 \n\t"
  935. "movq %%mm2, %%mm5 \n\t"
  936. "punpcklwd %5, %%mm0 \n\t"
  937. "punpcklwd %5, %%mm1 \n\t"
  938. "punpcklwd %5, %%mm2 \n\t"
  939. "punpckhwd %5, %%mm3 \n\t"
  940. "punpckhwd %5, %%mm4 \n\t"
  941. "punpckhwd %5, %%mm5 \n\t"
  942. "psllq $8, %%mm1 \n\t"
  943. "psllq $16, %%mm2 \n\t"
  944. "por %%mm1, %%mm0 \n\t"
  945. "por %%mm2, %%mm0 \n\t"
  946. "psllq $8, %%mm4 \n\t"
  947. "psllq $16, %%mm5 \n\t"
  948. "por %%mm4, %%mm3 \n\t"
  949. "por %%mm5, %%mm3 \n\t"
  950. :"=m"(*d)
  951. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null),"m"(mul15_mid),"m"(mul15_hi),"m"(mul16_mid)
  952. :"memory");
  953. /* borrowed 32 to 24 */
  954. __asm__ volatile(
  955. "movq %%mm0, %%mm4 \n\t"
  956. "movq %%mm3, %%mm5 \n\t"
  957. "movq %%mm6, %%mm0 \n\t"
  958. "movq %%mm7, %%mm1 \n\t"
  959. "movq %%mm4, %%mm6 \n\t"
  960. "movq %%mm5, %%mm7 \n\t"
  961. "movq %%mm0, %%mm2 \n\t"
  962. "movq %%mm1, %%mm3 \n\t"
  963. STORE_BGR24_MMX
  964. :"=m"(*d)
  965. :"m"(*s)
  966. :"memory");
  967. d += 24;
  968. s += 8;
  969. }
  970. __asm__ volatile(SFENCE:::"memory");
  971. __asm__ volatile(EMMS:::"memory");
/* scalar tail: expand 5/6/5-bit fields to 8 bits, duplicating top bits */
  972. while (s < end) {
  973. register uint16_t bgr;
  974. bgr = *s++;
  975. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  976. *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
  977. *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
  978. }
  979. }
  980. /*
  981. * mm0 = 00 B3 00 B2 00 B1 00 B0
  982. * mm1 = 00 G3 00 G2 00 G1 00 G0
  983. * mm2 = 00 R3 00 R2 00 R1 00 R0
  984. * mm6 = FF FF FF FF FF FF FF FF
  985. * mm7 = 00 00 00 00 00 00 00 00
  986. */
  987. #define PACK_RGB32 \
  988. "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
  989. "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
  990. "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
  991. "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
  992. "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
  993. "movq %%mm0, %%mm3 \n\t" \
  994. "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
  995. "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
  996. MOVNTQ" %%mm0, %0 \n\t" \
  997. MOVNTQ" %%mm3, 8%0 \n\t" \
  998. static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
  999. {
  1000. const uint16_t *end;
  1001. const uint16_t *mm_end;
  1002. uint8_t *d = dst;
  1003. const uint16_t *s = (const uint16_t *)src;
  1004. end = s + src_size/2;
  1005. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  1006. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  1007. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  1008. mm_end = end - 3;
  1009. while (s < mm_end) {
  1010. __asm__ volatile(
  1011. PREFETCH" 32%1 \n\t"
  1012. "movq %1, %%mm0 \n\t"
  1013. "movq %1, %%mm1 \n\t"
  1014. "movq %1, %%mm2 \n\t"
  1015. "pand %2, %%mm0 \n\t"
  1016. "pand %3, %%mm1 \n\t"
  1017. "pand %4, %%mm2 \n\t"
  1018. "psllq $5, %%mm0 \n\t"
  1019. "pmulhw %5, %%mm0 \n\t"
  1020. "pmulhw %5, %%mm1 \n\t"
  1021. "pmulhw %6, %%mm2 \n\t"
  1022. PACK_RGB32
  1023. :"=m"(*d)
  1024. :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid),"m"(mul15_hi)
  1025. :"memory");
  1026. d += 16;
  1027. s += 4;
  1028. }
  1029. __asm__ volatile(SFENCE:::"memory");
  1030. __asm__ volatile(EMMS:::"memory");
  1031. while (s < end) {
  1032. register uint16_t bgr;
  1033. bgr = *s++;
  1034. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  1035. *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
  1036. *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
  1037. *d++ = 255;
  1038. }
  1039. }
  1040. static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
  1041. {
  1042. const uint16_t *end;
  1043. const uint16_t *mm_end;
  1044. uint8_t *d = dst;
  1045. const uint16_t *s = (const uint16_t*)src;
  1046. end = s + src_size/2;
  1047. __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
  1048. __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
  1049. __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
  1050. mm_end = end - 3;
  1051. while (s < mm_end) {
  1052. __asm__ volatile(
  1053. PREFETCH" 32%1 \n\t"
  1054. "movq %1, %%mm0 \n\t"
  1055. "movq %1, %%mm1 \n\t"
  1056. "movq %1, %%mm2 \n\t"
  1057. "pand %2, %%mm0 \n\t"
  1058. "pand %3, %%mm1 \n\t"
  1059. "pand %4, %%mm2 \n\t"
  1060. "psllq $5, %%mm0 \n\t"
  1061. "psrlq $1, %%mm2 \n\t"
  1062. "pmulhw %5, %%mm0 \n\t"
  1063. "pmulhw %7, %%mm1 \n\t"
  1064. "pmulhw %6, %%mm2 \n\t"
  1065. PACK_RGB32
  1066. :"=m"(*d)
  1067. :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid),"m"(mul15_hi),"m"(mul16_mid)
  1068. :"memory");
  1069. d += 16;
  1070. s += 4;
  1071. }
  1072. __asm__ volatile(SFENCE:::"memory");
  1073. __asm__ volatile(EMMS:::"memory");
  1074. while (s < end) {
  1075. register uint16_t bgr;
  1076. bgr = *s++;
  1077. *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
  1078. *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
  1079. *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
  1080. *d++ = 255;
  1081. }
  1082. }
  1083. static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
  1084. {
  1085. x86_reg idx = 15 - src_size;
  1086. const uint8_t *s = src-idx;
  1087. uint8_t *d = dst-idx;
  1088. __asm__ volatile(
  1089. "test %0, %0 \n\t"
  1090. "jns 2f \n\t"
  1091. PREFETCH" (%1, %0) \n\t"
  1092. "movq %3, %%mm7 \n\t"
  1093. "pxor %4, %%mm7 \n\t"
  1094. "movq %%mm7, %%mm6 \n\t"
  1095. "pxor %5, %%mm7 \n\t"
  1096. ".p2align 4 \n\t"
  1097. "1: \n\t"
  1098. PREFETCH" 32(%1, %0) \n\t"
  1099. "movq (%1, %0), %%mm0 \n\t"
  1100. "movq 8(%1, %0), %%mm1 \n\t"
  1101. # if COMPILE_TEMPLATE_MMX2
  1102. "pshufw $177, %%mm0, %%mm3 \n\t"
  1103. "pshufw $177, %%mm1, %%mm5 \n\t"
  1104. "pand %%mm7, %%mm0 \n\t"
  1105. "pand %%mm6, %%mm3 \n\t"
  1106. "pand %%mm7, %%mm1 \n\t"
  1107. "pand %%mm6, %%mm5 \n\t"
  1108. "por %%mm3, %%mm0 \n\t"
  1109. "por %%mm5, %%mm1 \n\t"
  1110. # else
  1111. "movq %%mm0, %%mm2 \n\t"
  1112. "movq %%mm1, %%mm4 \n\t"
  1113. "pand %%mm7, %%mm0 \n\t"
  1114. "pand %%mm6, %%mm2 \n\t"
  1115. "pand %%mm7, %%mm1 \n\t"
  1116. "pand %%mm6, %%mm4 \n\t"
  1117. "movq %%mm2, %%mm3 \n\t"
  1118. "movq %%mm4, %%mm5 \n\t"
  1119. "pslld $16, %%mm2 \n\t"
  1120. "psrld $16, %%mm3 \n\t"
  1121. "pslld $16, %%mm4 \n\t"
  1122. "psrld $16, %%mm5 \n\t"
  1123. "por %%mm2, %%mm0 \n\t"
  1124. "por %%mm4, %%mm1 \n\t"
  1125. "por %%mm3, %%mm0 \n\t"
  1126. "por %%mm5, %%mm1 \n\t"
  1127. # endif
  1128. MOVNTQ" %%mm0, (%2, %0) \n\t"
  1129. MOVNTQ" %%mm1, 8(%2, %0) \n\t"
  1130. "add $16, %0 \n\t"
  1131. "js 1b \n\t"
  1132. SFENCE" \n\t"
  1133. EMMS" \n\t"
  1134. "2: \n\t"
  1135. : "+&r"(idx)
  1136. : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
  1137. : "memory");
  1138. for (; idx<15; idx+=4) {
  1139. register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
  1140. v &= 0xff00ff;
  1141. *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
  1142. }
  1143. }
  1144. static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
  1145. {
  1146. unsigned i;
  1147. x86_reg mmx_size= 23 - src_size;
  1148. __asm__ volatile (
  1149. "test %%"REG_a", %%"REG_a" \n\t"
  1150. "jns 2f \n\t"
  1151. "movq "MANGLE(mask24r)", %%mm5 \n\t"
  1152. "movq "MANGLE(mask24g)", %%mm6 \n\t"
  1153. "movq "MANGLE(mask24b)", %%mm7 \n\t"
  1154. ".p2align 4 \n\t"
  1155. "1: \n\t"
  1156. PREFETCH" 32(%1, %%"REG_a") \n\t"
  1157. "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1158. "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
  1159. "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
  1160. "psllq $16, %%mm0 \n\t" // 00 BGR BGR
  1161. "pand %%mm5, %%mm0 \n\t"
  1162. "pand %%mm6, %%mm1 \n\t"
  1163. "pand %%mm7, %%mm2 \n\t"
  1164. "por %%mm0, %%mm1 \n\t"
  1165. "por %%mm2, %%mm1 \n\t"
  1166. "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
  1167. MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
  1168. "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
  1169. "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
  1170. "pand %%mm7, %%mm0 \n\t"
  1171. "pand %%mm5, %%mm1 \n\t"
  1172. "pand %%mm6, %%mm2 \n\t"
  1173. "por %%mm0, %%mm1 \n\t"
  1174. "por %%mm2, %%mm1 \n\t"
  1175. "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
  1176. MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
  1177. "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
  1178. "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
  1179. "pand %%mm6, %%mm0 \n\t"
  1180. "pand %%mm7, %%mm1 \n\t"
  1181. "pand %%mm5, %%mm2 \n\t"
  1182. "por %%mm0, %%mm1 \n\t"
  1183. "por %%mm2, %%mm1 \n\t"
  1184. MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
  1185. "add $24, %%"REG_a" \n\t"
  1186. " js 1b \n\t"
  1187. "2: \n\t"
  1188. : "+a" (mmx_size)
  1189. : "r" (src-mmx_size), "r"(dst-mmx_size)
  1190. );
  1191. __asm__ volatile(SFENCE:::"memory");
  1192. __asm__ volatile(EMMS:::"memory");
  1193. if (mmx_size==23) return; //finished, was multiple of 8
  1194. src+= src_size;
  1195. dst+= src_size;
  1196. src_size= 23-mmx_size;
  1197. src-= src_size;
  1198. dst-= src_size;
  1199. for (i=0; i<src_size; i+=3) {
  1200. register uint8_t x;
  1201. x = src[i + 2];
  1202. dst[i + 1] = src[i + 1];
  1203. dst[i + 2] = src[i + 0];
  1204. dst[i + 0] = x;
  1205. }
  1206. }
  1207. static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
  1208. int width, int height,
  1209. int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
  1210. {
  1211. int y;
  1212. const x86_reg chromWidth= width>>1;
  1213. for (y=0; y<height; y++) {
  1214. //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
  1215. __asm__ volatile(
  1216. "xor %%"REG_a", %%"REG_a" \n\t"
  1217. ".p2align 4 \n\t"
  1218. "1: \n\t"
  1219. PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
  1220. PREFETCH" 32(%2, %%"REG_a") \n\t"
  1221. PREFETCH" 32(%3, %%"REG_a") \n\t"
  1222. "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
  1223. "movq %%mm0, %%mm2 \n\t" // U(0)
  1224. "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
  1225. "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
  1226. "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
  1227. "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
  1228. "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
  1229. "movq %%mm3, %%mm4 \n\t" // Y(0)
  1230. "movq %%mm5, %%mm6 \n\t" // Y(8)
  1231. "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
  1232. "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
  1233. "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
  1234. "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
  1235. MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
  1236. MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
  1237. MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
  1238. MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
  1239. "add $8, %%"REG_a" \n\t"
  1240. "cmp %4, %%"REG_a" \n\t"
  1241. " jb 1b \n\t"
  1242. ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
  1243. : "%"REG_a
  1244. );
  1245. if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
  1246. usrc += chromStride;
  1247. vsrc += chromStride;
  1248. }
  1249. ysrc += lumStride;
  1250. dst += dstStride;
  1251. }
  1252. __asm__(EMMS" …

Large files files are truncated, but you can click here to view the full file