PageRenderTime 95ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 1ms

/branches/jet3d_dev_msvc2005/source/Engine/JetEngine/Bitmap/Compression/YUV.c

#
C | 1021 lines | 749 code | 196 blank | 76 comment | 4 complexity | feec26ced659f2367447079cf4b77f5a MD5 | raw file
  1. /****************************************************************************************/
  2. /* Yuv */
  3. /* */
  4. /* Author: Charles Bloom */
  5. /* Description: YUV <-> RGB code */
  6. /* */
  7. /* The contents of this file are subject to the Jet3D Public License */
  8. /* Version 1.02 (the "License"); you may not use this file except in */
  9. /* compliance with the License. You may obtain a copy of the License at */
  10. /* http://www.jet3d.com */
  11. /* */
  12. /* Software distributed under the License is distributed on an "AS IS" */
  13. /* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See */
  14. /* the License for the specific language governing rights and limitations */
  15. /* under the License. */
  16. /* */
  17. /* The Original Code is Jet3D, released December 12, 1999. */
  18. /* Copyright (C) 1996-1999 Eclipse Entertainment, L.L.C. All Rights Reserved */
  19. /* */
  20. /****************************************************************************************/
  21. #include <assert.h>
  22. #include "YUV.h"
  23. #include "Utility.h"
  24. #include "Cpu.h"
  25. #ifdef BUILD_BE
  26. #include <inttypes.h>
  27. #define __int64 uint64_t
  28. #endif
  29. #pragma warning(disable : 4244) // int -> uint8 conversions abound
  30. #pragma warning(disable : 4799) // I know we've got no emms; it's done in wavelet.c
  31. /*}{******* RGB <-> YUV in C ***********/
  32. void RGBb_to_YUVb(const uint8 *RGB,uint8 *YUV)
  33. {
  34. int R = RGB[0], G = RGB[1], B = RGB[2];
  35. YUV[0] = Y_RGB(R,G,B);
  36. YUV[1] = U_RGB(R,G,B) + 127;
  37. YUV[2] = V_RGB(R,G,B) + 127;
  38. }
  39. void YUVb_to_RGBb(const uint8 *YUV,uint8 *RGB)
  40. {
  41. int y,u,v,r,g,b;
  42. y = YUV[0];
  43. u = YUV[1] - 127;
  44. v = YUV[2] - 127;
  45. r = R_YUV(y,u,v);
  46. g = G_YUV(y,u,v);
  47. b = B_YUV(y,u,v);
  48. RGB[0] = minmax(r,0,255); // we could get negative ones and whatnot
  49. RGB[1] = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  50. RGB[2] = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  51. }
  52. void RGBb_to_YUVb_line(const uint8 *RGB,uint8 *YUV,int len)
  53. {
  54. int R,G,B;
  55. while(len--)
  56. {
  57. R = *RGB++;
  58. G = *RGB++;
  59. B = *RGB++;
  60. *YUV++ = Y_RGB(R,G,B);
  61. *YUV++ = U_RGB(R,G,B) + 127;
  62. *YUV++ = V_RGB(R,G,B) + 127;
  63. }
  64. }
  65. void YUVb_to_RGBb_line(const uint8 *YUV,uint8 *RGB,int len)
  66. {
  67. int y,u,v,r,g,b;
  68. while(len--)
  69. {
  70. y = (*YUV++);
  71. u = (*YUV++) - 127;
  72. v = (*YUV++) - 127;
  73. r = R_YUV(y,u,v);
  74. g = G_YUV(y,u,v);
  75. b = B_YUV(y,u,v);
  76. *RGB++ = minmax(r,0,255); // we could get negative ones and whatnot
  77. *RGB++ = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  78. *RGB++ = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  79. }
  80. }
  81. void RGBb_to_YUVi(const uint8 *RGB,int *Y,int *U,int *V)
  82. {
  83. int R = RGB[0], G = RGB[1], B = RGB[2];
  84. *Y = Y_RGB(R,G,B);
  85. *U = U_RGB(R,G,B) + 127;
  86. *V = V_RGB(R,G,B) + 127;
  87. assert( isinrange(*Y,0,255) );
  88. assert( isinrange(*U,0,255) );
  89. assert( isinrange(*V,0,255) );
  90. }
  91. void YUVi_to_RGBb(int y,int u,int v,uint8 *RGB)
  92. {
  93. int r,g,b;
  94. // yuv can be kicked out of 0,255 by the wavelet
  95. // assert( isinrange(y,0,255) );
  96. // assert( isinrange(u,0,255) );
  97. // assert( isinrange(v,0,255) );
  98. u -= 127;
  99. v -= 127;
  100. r = R_YUV(y,u,v); // this is just like a matrix multiply
  101. g = G_YUV(y,u,v);
  102. b = B_YUV(y,u,v);
  103. RGB[0] = minmax(r,0,255); // we could get negative ones and whatnot
  104. RGB[1] = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  105. RGB[2] = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  106. }
  107. void RGBi_to_YUVi(int R,int G,int B,int *Y,int *U,int *V)
  108. {
  109. assert( isinrange(R,0,255) );
  110. assert( isinrange(G,0,255) );
  111. assert( isinrange(B,0,255) );
  112. *Y = Y_RGB(R,G,B);
  113. *U = U_RGB(R,G,B) + 127;
  114. *V = V_RGB(R,G,B) + 127;
  115. assert( isinrange(*Y,0,255) );
  116. assert( isinrange(*U,0,255) );
  117. assert( isinrange(*V,0,255) );
  118. }
  119. void YUVi_to_RGBi(int y,int u,int v,int *R,int *G,int *B)
  120. {
  121. int r,g,b;
  122. // yuv can be kicked out of 0,255 by the wavelet
  123. // assert( isinrange(y,0,255) );
  124. // assert( isinrange(u,0,255) );
  125. // assert( isinrange(v,0,255) );
  126. u -= 127;
  127. v -= 127;
  128. r = R_YUV(y,u,v); // this is just like a matrix multiply
  129. g = G_YUV(y,u,v);
  130. b = B_YUV(y,u,v);
  131. *R = minmax(r,0,255); // we could get negative ones and whatnot
  132. *G = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  133. *B = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  134. }
  135. void YUVi_to_RGBi_line(int *line1,int *line2,int *line3,int len)
  136. {
  137. int y,u,v,r,g,b;
  138. // <> use MMX
  139. cachetouch_w(line1,len>>3);
  140. cachetouch_w(line2,len>>3);
  141. cachetouch_w(line3,len>>3);
  142. while(len--)
  143. {
  144. y = *line1;
  145. u = *line2 - 127;
  146. v = *line3 - 127;
  147. r = R_YUV(y,u,v);
  148. g = G_YUV(y,u,v);
  149. b = B_YUV(y,u,v);
  150. r = minmax(r,0,255);
  151. g = minmax(g,0,255);
  152. b = minmax(b,0,255);
  153. *line1++ = r;
  154. *line2++ = g;
  155. *line3++ = b;
  156. }
  157. }
  158. void YUVi_to_BGRb_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
  159. {
  160. int y,u,v,r,g,b,len;
  161. int *line1,*line2,*line3;
  162. uint8 * bline;
  163. line1 = iline1;
  164. line2 = iline2;
  165. line3 = iline3;
  166. bline = ibline;
  167. len = ilen;
  168. cachetouch_r(line1,len>>3);
  169. cachetouch_r(line2,len>>3);
  170. cachetouch_r(line3,len>>3);
  171. cachetouch_w(bline,(len*3)>>5);
  172. while(len--)
  173. {
  174. y = (*line1++);
  175. u = (*line2++) - 127;
  176. v = (*line3++) - 127;
  177. r = R_YUV(y,u,v);
  178. g = G_YUV(y,u,v);
  179. b = B_YUV(y,u,v);
  180. r = minmax(r,0,255);
  181. g = minmax(g,0,255);
  182. b = minmax(b,0,255);
  183. bline[0] = b;
  184. bline[1] = g;
  185. bline[2] = r;
  186. bline+=3;
  187. }
  188. }
  189. void YUVi_to_BGRb_lines_c(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
  190. {
  191. int yz;
  192. for(yz=0;yz<h;yz++)
  193. {
  194. int y,u,v,r,g,b,len;
  195. int *line1,*line2,*line3;
  196. uint8 * bline;
  197. line1 = Ylines[yz];
  198. line2 = Ulines[yz];
  199. line3 = Vlines[yz];
  200. bline = BGRptr;
  201. len = w;
  202. cachetouch_r(line1,len>>3);
  203. cachetouch_r(line2,len>>3);
  204. cachetouch_r(line3,len>>3);
  205. cachetouch_w(bline,(len*3)>>5);
  206. while(len--)
  207. {
  208. y = (*line1++);
  209. u = (*line2++) - 127;
  210. v = (*line3++) - 127;
  211. r = R_YUV(y,u,v);
  212. g = G_YUV(y,u,v);
  213. b = B_YUV(y,u,v);
  214. r = minmax(r,0,255);
  215. g = minmax(g,0,255);
  216. b = minmax(b,0,255);
  217. bline[0] = b;
  218. bline[1] = g;
  219. bline[2] = r;
  220. bline+=3;
  221. }
  222. BGRptr += BGRstride;
  223. }
  224. }
  225. void YUVi_to_XRGB_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
  226. {
  227. int y,u,v,r,g,b,len;
  228. int *line1,*line2,*line3;
  229. uint8 * bline;
  230. line1 = iline1;
  231. line2 = iline2;
  232. line3 = iline3;
  233. bline = ibline;
  234. len = ilen;
  235. cachetouch_r(line1,len>>3);
  236. cachetouch_r(line2,len>>3);
  237. cachetouch_r(line3,len>>3);
  238. cachetouch_w(bline,len>>3);
  239. while(len--)
  240. {
  241. y = (*line1++);
  242. u = (*line2++) - 127;
  243. v = (*line3++) - 127;
  244. r = R_YUV(y,u,v);
  245. g = G_YUV(y,u,v);
  246. b = B_YUV(y,u,v);
  247. r = minmax(r,0,255);
  248. g = minmax(g,0,255);
  249. b = minmax(b,0,255);
  250. bline[0] = b;
  251. bline[1] = g;
  252. bline[2] = r;
  253. bline += 4;
  254. }
  255. }
  256. /*}{******* MMX YUV -> BGR blitters ***********/
/*
 * Multiplier constants loaded into mm3 / mm4 by the MMX blitters below and
 * applied with pmulhw to four replicated V (resp. U) values.
 *
 * Each 64-bit constant is four signed 16-bit lanes.  From the lowest lane up:
 *   Const_V_16 : 0x0000, 0x2DB5, 0xA646, 0x0000
 *   Const_U_16 : 0x7168, 0xE9FA, 0x0000, 0x0000
 * Lane 0 ends up in the first byte stored per pixel -- presumably the lanes
 * are B, G, R, X in that order, matching the C blitters; confirm against the
 * R_YUV/G_YUV/B_YUV macros in YUV.h.
 *
 * The BUILD_BE spelling (2789617077 * 256 * 256) denotes the same value as
 * the WIN32 hex literal: 2789617077 == 0xA6462DB5, shifted left 16 bits.
 * Const_V_16 is only defined when BUILD_BE or WIN32 is defined.
 */
#ifdef BUILD_BE // necessary due to integer overflow on the various systems..
static const __int64 Const_V_16 = 2789617077 * 256 * 256;//0x0000A6462DB50000;
#endif
#ifdef WIN32
static const __int64 Const_V_16 = 0x0000A6462DB50000;
#endif
static const __int64 Const_U_16 = 0x00000000E9FA7168;
  264. void YUVi_to_BGRb_lines_mmx(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
  265. {
  266. int yz;
  267. for(yz=0;yz<h;yz++)
  268. {
  269. int *line1,*line2,*line3;
  270. uint8 * bline;
  271. line1 = Ylines[yz];
  272. line2 = Ulines[yz];
  273. line3 = Vlines[yz];
  274. bline = BGRptr;
  275. BGRptr += BGRstride;
  276. assert(w > 1 && h > 1 );
  277. cachetouch_r(line1,w>>3);
  278. cachetouch_r(line2,w>>3);
  279. cachetouch_r(line3,w>>3);
  280. cachetouch_w(bline,(w*3)>>5);
  281. #ifdef WIN32
  282. __asm
  283. {
  284. mov ecx,w
  285. sub ecx,1
  286. mov edi,bline
  287. movq mm3,Const_V_16
  288. movq mm4,Const_U_16
  289. More:
  290. /**
  291. *
  292. * ecx is width
  293. * edi is BGRptr
  294. *
  295. * eax is (V<<2)-509
  296. * ebx is (U<<2)-509
  297. * edx is Y
  298. *
  299. * the multiply coefficients are in 14 bits, then we rshr 16 via mulhw
  300. *
  301. * mm0 is four V int16's, multiplied by their coefficients (mm3)
  302. * mm1 is four U int16's, multiplied by their coefficients (mm4)
  303. * mm2 is four Y int16's
  304. *
  305. * XRGB = mm0 + mm1 + mm2
  306. *
  307. * we're taking about 45 clocks
  308. * my manual count indicates we could take about 37 if we were perfect
  309. */
  310. /*
  311. *
  312. * MMX optimization notes:
  313. * 1. there is only one MMX pack/unpack unit
  314. * 2. there is only one MMX multiply unit
  315. * 3. MMX instructions that use memory or integers use port 0 only
  316. * 4. all MMX instructions are 1 clock except multiply, which is 3
  317. */
  318. mov eax,line3 // V
  319. mov eax,[eax] // eax = v; hard stall on eax, inevitable
  320. add line3,4 // no stall on line3
  321. shl eax,2 // V<<=2
  322. mov ebx,line2 // U
  323. sub eax,509 // do ((V<<2)-510) instead of ((V-127)<<2)
  324. mov ebx,[ebx] // ebx = u
  325. add line2,4
  326. movd mm0,eax // mm0 = [0][v]
  327. shl ebx,2
  328. punpckldq mm0,mm0 // mm0 = [v][v]
  329. sub ebx,509
  330. packssdw mm0,mm0 // mm0 = [v][v][v][v]
  331. movd mm1,ebx // mm1 = [0][u]
  332. mov edx,line1 // Y
  333. pmulhw mm0,mm3 // keep only high words; same as multiplying in 32 bits and doing >>16
  334. // put some non-dependent stuff after the multiply:
  335. mov edx,[edx] // edx = y
  336. punpckldq mm1,mm1 // mm1 = [u][u]
  337. movd mm2,edx // mm2 = [0][y]
  338. packssdw mm1,mm1 // mm1 = [u][u][u][u]
  339. // these two packs cannot pair!
  340. punpckldq mm2,mm2 // mm2 = [y][y]
  341. pmulhw mm1,mm4
  342. // put some stuff after the multiply:
  343. add line1,4
  344. packssdw mm2,mm2 // mm2 = [y][y][y][y]
  345. // now XRGB = mm0 + mm1 + mm2
  346. paddsw mm0,mm1
  347. paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
  348. // convert the four int16s to eight bytes; also do a clamp(0,255) for free!
  349. packuswb mm0,mm0 // hard stall on mm0, inevitable
  350. movd [edi],mm0 // hard stall on mm0, then unaligned write! bad!
  351. add edi,3 // no stall on edi
  352. dec ecx
  353. jnz More
  354. //{ one last one that doesn't write 4->3
  355. mov eax,line3 // V
  356. mov eax,[eax]
  357. add line3,4
  358. shl eax,2
  359. sub eax,509
  360. movd mm0,eax // mm0 = [0][x]
  361. punpckldq mm0,mm0 // mm0 = [x][x]
  362. packssdw mm0,mm0 // mm0 = [x][x][x][x]
  363. pmulhw mm0,mm3
  364. mov ebx,line2 // U
  365. mov ebx,[ebx]
  366. add line2,4
  367. shl ebx,2
  368. sub ebx,509
  369. movd mm1,ebx // mm0 = [0][x]
  370. punpckldq mm1,mm1 // mm0 = [x][x]
  371. packssdw mm1,mm1 // mm0 = [x][x][x][x]
  372. pmulhw mm1,mm4
  373. mov edx,line1 // Y
  374. mov edx,[edx]
  375. add line1,4
  376. movd mm2,edx // mm0 = [0][x]
  377. punpckldq mm2,mm2 // mm0 = [x][x]
  378. packssdw mm2,mm2 // mm0 = [x][x][x][x]
  379. paddsw mm0,mm1
  380. paddsw mm0,mm2
  381. packuswb mm0,mm0
  382. movd eax,mm0 // eax is XRGB
  383. mov [edi],ax
  384. shr eax,16
  385. mov [edi+2],al
  386. //}
  387. }
  388. #endif
  389. #ifdef BUILD_BE
  390. __asm__ __volatile__ ("
  391. movl %0, %%ecx // %0 = w ; mov ecx,w
  392. subl $1, %%ecx // sub ecx,1
  393. movl %1, %%edi // %1 = bline ; mov edi,bline
  394. movq %5, %%mm3 //Const_V_16, %%mm3 // %2 = Const_V_16, i think we could use the name cause its global but.. movq mm3,Const_V_16
  395. movq %6, %%mm4 // Const_U_16, %%mm4 // %3 = const_U_15, same as above ; movq mm4,Const_U_16
  396. More:
  397. ///**
  398. //*
  399. //* ecx is width
  400. //* edi is BGRptr
  401. //*
  402. //* eax is (V<<2)-509
  403. //* ebx is (U<<2)-509
  404. //* edx is Y
  405. //*
  406. //* the multiply coefficients are in 14 bits, then we rshr 16 via mulhw
  407. //*
  408. //* mm0 is four V int16's, multiplied by their coefficients (mm3)
  409. //* mm1 is four U int16's, multiplied by their coefficients (mm4)
  410. //* mm2 is four Y int16's
  411. //*
  412. //* XRGB = mm0 + mm1 + mm2
  413. //*
  414. //* we're taking about 45 clocks
  415. //* my manual count indicates we could take about 37 if we were perfect
  416. //*/
  417. ///*
  418. //*
  419. //* MMX optimization notes:
  420. //* 1. there is only one MMX pack/unpack unit
  421. //* 2. there is only one MMX multiply unit
  422. //* 3. MMX instructions that use memory or integers use port 0 only
  423. //* 4. all MMX instructions are 1 clock except multiply, which is 3
  424. //*/
  425. movl %2, %%eax // %2 = line 3 ;mov eax,line3 // V
  426. movl (%%eax), %%eax //mov eax,[eax] // eax = v; hard stall on eax, inevitable
  427. addl $4, %2 // %2 = line 3 add line3,4 // no stall on line3
  428. shll $2, %%eax //shl eax,2 // V<<=2
  429. movl %3, %%ebx // %3 = line2 mov ebx,line2 // U
  430. subl $509, %%eax // sub eax,509 // do ((V<<2)-510) instead of ((V-127)<<2)
  431. movl (%%ebx), %%ebx // mov ebx,[ebx] // ebx = u
  432. addl $4, %3 // %3 = line2 ;add line2,4
  433. movd %%eax, %%mm0 //;movd mm0,eax // mm0 = [0][v]
  434. shll $2, %%ebx // shl ebx,2
  435. punpckldq %%mm0, %%mm0 //;punpckldq mm0,mm0 // mm0 = [v][v]
  436. subl $509, %%ebx //sub ebx,509
  437. packssdw %%mm0, %%mm0 //;packssdw mm0,mm0 // mm0 = [v][v][v][v]
  438. movd %%ebx, %%mm1 //;movd mm1,ebx // mm1 = [0][u]
  439. movl %4, %%edx //%4 = line1;mov edx,line1 // Y
  440. pmulhw %%mm0, %%mm3 // reverse? pmulhw mm0,mm3 // keep only high words; same as multiplying in 32 bits and doing >>16
  441. // put some non-dependent stuff after the multiply:
  442. movl (%%edx), %%edx //;mov edx,[edx] // edx = y
  443. punpckldq %%mm1,%%mm1 //;punpckldq mm1,mm1 // mm1 = [u][u]
  444. movd %%edx, %%mm2 //;movd mm2,edx // mm2 = [0][y]
  445. packssdw %%mm1,%%mm1 //;packssdw mm1,mm1 // mm1 = [u][u][u][u]
  446. // these two packs cannot pair!
  447. punpckldq %%mm2,%%mm2 //;punpckldq mm2,mm2 // mm2 = [y][y]
  448. pmulhw %%mm4, %%mm1 // reverse? pmulhw mm1,mm4
  449. // put some stuff after the multiply:
  450. addl $4, %4 // ;add line1,4
  451. packssdw %%mm2, %%mm2 // ;packssdw mm2,mm2 // mm2 = [y][y][y][y]
  452. // now XRGB = mm0 + mm1 + mm2
  453. paddsw %%mm1, %%mm0 // ?reverse? paddsw mm0,mm1
  454. paddsw %%mm2, %%mm0 // ;paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
  455. // convert the four int16s to eight bytes; also do a clamp(0,255) for free!
  456. packuswb %%mm0, %%mm0 //; packuswb mm0,mm0 // hard stall on mm0, inevitable
  457. movd %%mm0, (%%edi) //; movd [edi],mm0 // hard stall on mm0, then unaligned write! bad!
  458. addl $3, %%edi //;add edi,3 // no stall on edi
  459. dec %%ecx //;dec ecx
  460. jnz More
  461. //{ one last one that doesn't write 4->3
  462. movl %2, %%eax //;mov eax,line3 // V
  463. movl (%%eax), %%eax //;mov eax,[eax]
  464. addl $4, %2 //;add line3,4
  465. shl $2, %%eax //;shl eax,2
  466. subl $509, %%eax //;sub eax,509
  467. movd %%eax, %%mm0 //;movd mm0,eax // mm0 = [0][x]
  468. punpckldq %%mm0, %%mm0 //;punpckldq mm0,mm0 // mm0 = [x][x]
  469. packssdw %%mm0, %%mm0 //;packssdw mm0,mm0 // mm0 = [x][x][x][x]
  470. pmulhw %%mm3, %%mm0 //;pmulhw mm0,mm3
  471. movl %3, %%ebx //;mov ebx,line2 // U
  472. movl (%%ebx), %%ebx //;mov ebx,[ebx]
  473. addl $4, %3 //;add line2,4
  474. shll $2, %%ebx //;shl ebx,2
  475. subl $509, %%ebx //;sub ebx,509
  476. movd %%ebx, %%mm1 //;movd mm1,ebx // mm0 = [0][x]
  477. punpckldq %%mm1, %%mm1 //;punpckldq mm1,mm1 // mm0 = [x][x]
  478. packssdw %%mm1, %%mm1 //;packssdw mm1,mm1 // mm0 = [x][x][x][x]
  479. pmulhw %%mm4, %%mm1 //;pmulhw mm1,mm4
  480. movl %4, %%edx //;mov edx,line1 // Y
  481. movl (%%edx), %%edx //;mov edx,[edx]
  482. addl $4, %4 //;add line1,4
  483. movd %%edx, %%mm2 //;movd mm2,edx // mm0 = [0][x]
  484. punpckldq %%mm2,%%mm2 //;punpckldq mm2,mm2 // mm0 = [x][x]
  485. packssdw %%mm2, %%mm2 //;packssdw mm2,mm2 // mm0 = [x][x][x][x]
  486. paddsw %%mm1, %%mm0 //;paddsw mm0,mm1
  487. paddsw %%mm2, %%mm0 //;paddsw mm0,mm2
  488. packuswb %%mm0, %%mm0 //;packuswb mm0,mm0
  489. movd %%eax, %%mm0 //;movd eax,mm0 // eax is XRGB
  490. mov (%%edi), %%ax //;mov [edi],ax
  491. shr $16, %%eax //;shr eax,16
  492. mov %%al, 2(%%edi) //;mov [edi+2],al
  493. " : // outputs
  494. : "m" (w), "m" (bline), "m" (line3), "m" (line2), "m" (line1), "m" (Const_V_16), "m" (Const_U_16)// inputs
  495. : "%edi", "%edx", "%eax", "%ebx", "%ecx" );// clobbered
  496. #endif
  497. }
  498. //__asm { emms }
  499. }
  500. void YUVi_to_BGRb_line_mmx2(int *line1,int *line2,int *line3,uint8 * bline,int len)
  501. {
  502. assert(len > 1 );
  503. len --;
  504. cachetouch_r(line1,len>>3);
  505. cachetouch_r(line2,len>>3);
  506. cachetouch_r(line3,len>>3);
  507. cachetouch_w(bline,(len*3)>>5);
  508. #ifdef WIN32
  509. __asm
  510. {
  511. mov ecx,len
  512. mov edi,bline
  513. movq mm3,Const_V_16
  514. movq mm4,Const_U_16
  515. YUVi_to_BGRb_line_mmx2_More:
  516. mov eax,line3 // V
  517. mov eax,[eax] // hard stall on eax, inevitable
  518. add line3,4 // no stall on line3
  519. shl eax,2
  520. mov ebx,line2 // U
  521. sub eax,510
  522. mov ebx,[ebx]
  523. add line2,4
  524. movd mm0,eax // mm0 = [0][x]
  525. shl ebx,2
  526. punpckldq mm0,mm0 // mm0 = [x][x]
  527. sub ebx,510
  528. packssdw mm0,mm0 // mm0 = [x][x][x][x]
  529. movd mm1,ebx // mm0 = [0][x]
  530. pmulhw mm0,mm3
  531. mov edx,line1 // Y
  532. punpckldq mm1,mm1 // mm0 = [x][x]
  533. mov edx,[edx]
  534. packssdw mm1,mm1 // mm0 = [x][x][x][x]
  535. movd mm2,edx // mm0 = [0][x]
  536. add line1,4
  537. punpckldq mm2,mm2 // mm0 = [x][x]
  538. pmulhw mm1,mm4
  539. packssdw mm2,mm2 // mm0 = [x][x][x][x]
  540. paddsw mm0,mm1
  541. paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
  542. packuswb mm0,mm0 // hard stall on mm0, inevitable
  543. movd [edi],mm0 // unaligned write! bad!
  544. add edi,3 // no stall on edi
  545. dec ecx
  546. jnz YUVi_to_BGRb_line_mmx2_More
  547. mov bline,edi
  548. //emms
  549. }
  550. #endif
  551. #ifdef BUILD_BE
  552. __asm__ __volatile__ ( "
  553. movl %0, %%ecx //;%0 = len ;mov ecx,len
  554. movl %1, %%edi //;%1 = bline ;mov edi,bline
  555. movq Const_V_16, %%mm3 //%2 = Const_V_16;movq mm3,Const_V_16
  556. movq Const_U_16, %%mm4 //%3 = Const_U_16;movq mm4,Const_U_16
  557. YUVi_to_BGRb_line_mmx2_More:
  558. movl %2,%%eax // ;%4 = line3 //mov eax,line3 // V
  559. movl (%%eax), %%eax //;mov eax,[eax] // hard stall on eax, inevitable
  560. addl $4, %2 //;add line3,4 // no stall on line3
  561. shll $2, %%eax //;shl eax,2
  562. movl %3, %%ebx //;3 = line2 mov ebx,line2 // U
  563. subl $510, %%eax //;sub eax,510
  564. movl (%%ebx), %%ebx //;mov ebx,[ebx]
  565. addl $4, %3 //;add line2,4
  566. movd %%eax, %%mm0 //;movd mm0,eax // mm0 = [0][x]
  567. shll $2, %%ebx //;shl ebx,2
  568. punpckldq %%mm0, %%mm0 //;punpckldq mm0,mm0 // mm0 = [x][x]
  569. subl $510,%%ebx //;sub ebx,510
  570. packssdw %%mm0, %%mm0 //;packssdw mm0,mm0 // mm0 = [x][x][x][x]
  571. movd %%ebx, %%mm1 //;movd mm1,ebx // mm0 = [0][x]
  572. pmulhw %%mm3, %%mm0 //;pmulhw mm0,mm3
  573. movl %4, %%edx // ;%4 = line 1mov edx,line1 // Y
  574. punpckldq %%mm1, %%mm1 //;punpckldq mm1,mm1 // mm0 = [x][x]
  575. movl (%%edx), %%edx //;mov edx,[edx]
  576. packssdw %%mm1, %%mm1 //;packssdw mm1,mm1 // mm0 = [x][x][x][x]
  577. movd %%edx, %%mm2 //;movd mm2,edx // mm0 = [0][x]
  578. addl $6, %3 //;add line1,4
  579. punpckldq %%mm2,%%mm2 //;punpckldq mm2,mm2 // mm0 = [x][x]
  580. pmulhw %%mm4, %%mm1 //; (?reverse?) pmulhw mm1,mm4
  581. packssdw %%mm2, %%mm2 //; packssdw mm2,mm2 // mm0 = [x][x][x][x]
  582. paddsw %%mm1, %%mm0 // ?reverse? paddsw mm0,mm1
  583. paddsw %%mm2, %%mm0 // ?reverse? paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
  584. packuswb %%mm0, %%mm0 // packuswb mm0,mm0 // hard stall on mm0, inevitable
  585. movd (%%edi), %%mm0 // ;movd [edi],mm0 // unaligned write! bad!
  586. addl $3, %%edi // ;add edi,3 // no stall on edi
  587. decl %%ecx //;dec ecx
  588. jnz YUVi_to_BGRb_line_mmx2_More
  589. movl %%edi, %1 // ;mov bline,edi"
  590. : // outputs
  591. : "m" (len), "m" (bline), "m" (line3) , "m" (line2), "m" (line1)
  592. : "%ebx" , "%eax" , "%edi" , "%ecx" , "%edx" );
  593. #endif
  594. {
  595. int y,u,v,r,g,b;
  596. y = (*line1);
  597. u = (*line2) - 127;
  598. v = (*line3) - 127;
  599. r = R_YUV(y,u,v);
  600. g = G_YUV(y,u,v);
  601. b = B_YUV(y,u,v);
  602. r = minmax(r,0,255);
  603. g = minmax(g,0,255);
  604. b = minmax(b,0,255);
  605. bline[0] = b;
  606. bline[1] = g;
  607. bline[2] = r;
  608. }
  609. }
  610. void YUVi_to_XRGB_line_mmx(int *line1,int *line2,int *line3,uint8 * bline,int len)
  611. {
  612. assert(len > 0 );
  613. cachetouch_r(line1,len>>3);
  614. cachetouch_r(line2,len>>3);
  615. cachetouch_r(line3,len>>3);
  616. cachetouch_w(bline,len>>3);
  617. #ifdef WIN32
  618. __asm
  619. {
  620. mov ecx,len
  621. mov edi,bline
  622. movq mm3,Const_V_16
  623. movq mm4,Const_U_16
  624. More:
  625. mov eax,line3 // V
  626. mov eax,[eax] // hard stall on eax, inevitable
  627. add line3,4 // no stall on line3
  628. shl eax,2
  629. mov ebx,line2 // U
  630. sub eax,510
  631. mov ebx,[ebx]
  632. add line2,4
  633. movd mm0,eax // mm0 = [0][x]
  634. shl ebx,2
  635. punpckldq mm0,mm0 // mm0 = [x][x]
  636. sub ebx,510
  637. packssdw mm0,mm0 // mm0 = [x][x][x][x]
  638. movd mm1,ebx // mm0 = [0][x]
  639. pmulhw mm0,mm3
  640. mov edx,line1 // Y
  641. punpckldq mm1,mm1 // mm0 = [x][x]
  642. mov edx,[edx]
  643. packssdw mm1,mm1 // mm0 = [x][x][x][x]
  644. movd mm2,edx // mm0 = [0][x]
  645. add line1,4
  646. punpckldq mm2,mm2 // mm0 = [x][x]
  647. pmulhw mm1,mm4
  648. packssdw mm2,mm2 // mm0 = [x][x][x][x]
  649. paddsw mm0,mm1
  650. paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
  651. packuswb mm0,mm0 // hard stall on mm0, inevitable
  652. movd [edi],mm0
  653. add edi,4 // no stall on edi
  654. dec ecx
  655. jnz More
  656. //emms
  657. }
  658. #endif
  659. #ifdef BUILD_BE
  660. __asm__ __volatile__ ("
  661. movl %0, %%ecx // %0 = len ;mov ecx,len
  662. movl %1, %%edi // %1 = bline ; mov edi,bline
  663. movq Const_V_16, %%mm3 //;movq mm3,Const_V_16
  664. movq Const_U_16, %%mm4 //;movq mm4,Const_U_16
  665. YUVi_to_XRGB_line_mmx_More:
  666. movl %2, %%eax // %2 = line3 //mov eax,line3 // V
  667. movl (%%eax), %%eax //;mov eax,[eax] // hard stall on eax, inevitable
  668. addl $4, %2 //;add line3,4 // no stall on line3
  669. shll $2, %%eax //shl eax,2
  670. movl %3, %%ebx // %3 = line2 mov ebx,line2 // U
  671. subl $510, %%eax //;sub eax,510
  672. movl (%%ebx), %%ebx //;mov ebx,[ebx]
  673. addl $4, %3 //;add line2,4
  674. movd %%eax, %%mm0 //;movd mm0,eax // mm0 = [0][x]
  675. shll $2, %%ebx //; shl ebx,2
  676. punpckldq %%mm0, %%mm0 //;punpckldq mm0,mm0 // mm0 = [x][x]
  677. subl $510, %%ebx //;sub ebx,510
  678. packssdw %%mm0, %%mm0 //;packssdw mm0,mm0 // mm0 = [x][x][x][x]
  679. movd %%ebx, %%mm1 //;movd mm1,ebx // mm0 = [0][x]
  680. pmulhw %%mm3, %%mm0 //?reverse? pmulhw mm0,mm3
  681. movl %4 , %%edx // %4 = line1 mov edx,line1 // Y
  682. punpckldq %%mm1, %%mm1 //; punpckldq mm1,mm1 // mm0 = [x][x]
  683. movl (%%edx), %%edx //; mov edx,[edx]
  684. packssdw %%mm1, %%mm1 //;packssdw mm1,mm1 // mm0 = [x][x][x][x]
  685. movd %%edx, %%mm2 //; movd mm2,edx // mm0 = [0][x]
  686. addl $4, %4 // add line1,4
  687. punpckldq %%mm2, %%mm2 //; punpckldq mm2,mm2 // mm0 = [x][x]
  688. pmulhw %%mm4, %%mm1 // ?reverse? pmulhw mm1,mm4
  689. packssdw %%mm2, %%mm2 // packssdw mm2,mm2 // mm0 = [x][x][x][x]
  690. paddsw %%mm1, %%mm0 // ?reverse? paddsw mm0,mm1
  691. paddsw %%mm2, %%mm0 // ?reverse? paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
  692. packuswb %%mm0, %%mm0 // packuswb mm0,mm0 // hard stall on mm0, inevitable
  693. movd %%mm0, (%%edi) //;movd [edi],mm0
  694. addl $4, %%edi //;add edi,4 // no stall on edi
  695. decl %%ecx //dec ecx
  696. jnz YUVi_to_XRGB_line_mmx_More
  697. "
  698. : // outputs
  699. : "g" (len), "g" (bline) , "g" (line3), "g" (line2) , "g" (line1)
  700. : "%ecx" , "%edi", "%eax", "%ebx", "%edx");
  701. #endif // BUILD_BE
  702. }
  703. /*}{******* CPU setup ***********/
/*
 * Blitter entry points, selected at run time.  Both are NULL until
 * SetupYUV() assigns either the MMX or the plain C implementation, so
 * SetupYUV() must run before either pointer is called.
 */
// one line of planar Y,U,V ints -> 4-byte-per-pixel output
void (*YUVi_to_XRGB_line)(int *line1,int *line2,int *line3,uint8 * bline,int len) = NULL;
// h lines of planar Y,U,V ints -> 3-byte-per-pixel (B,G,R) image with a byte stride
void (*YUVi_to_BGRb_lines)(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride) = NULL;
  706. void SetupYUV(void)
  707. {
  708. jeCPU_GetInfo();
  709. if ( jeCPU_Features & JE_CPU_HAS_MMX )
  710. {
  711. // timed on hare512.bmp :
  712. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_mmx1; // blit : 0.025 seconds = 47.2 clocks / pixel
  713. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_mmx2; // blit : 0.025 seconds = 47.2 clocks / pixel
  714. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_c; // blit : 0.034 seconds = 66.6 clocks / pixel
  715. YUVi_to_XRGB_line = YUVi_to_XRGB_line_mmx;
  716. YUVi_to_BGRb_lines = YUVi_to_BGRb_lines_mmx;// blit : 0.0245 seconds= 45.9 clocks / pixel
  717. }
  718. else
  719. {
  720. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_c;
  721. YUVi_to_XRGB_line = YUVi_to_XRGB_line_c;
  722. YUVi_to_BGRb_lines = YUVi_to_BGRb_lines_c;
  723. }
  724. }
  725. /*}******* EOF ***********/