PageRenderTime 88ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/trunk/source/Engine/JetEngine/Bitmap/Compression/YUV.cpp

#
C++ | 703 lines | 441 code | 186 blank | 76 comment | 4 complexity | 1f8521119bb673056cf24518967ce036 MD5 | raw file
  1. /****************************************************************************************/
  2. /* Yuv */
  3. /* */
  4. /* Author: Charles Bloom */
  5. /* Description: YUV <-> RGB code */
  6. /* */
  7. /* The contents of this file are subject to the Jet3D Public License */
  8. /* Version 1.02 (the "License"); you may not use this file except in */
  9. /* compliance with the License. You may obtain a copy of the License at */
  10. /* http://www.jet3d.com */
  11. /* */
  12. /* Software distributed under the License is distributed on an "AS IS" */
  13. /* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See */
  14. /* the License for the specific language governing rights and limitations */
  15. /* under the License. */
  16. /* */
  17. /* The Original Code is Jet3D, released December 12, 1999. */
  18. /* Copyright (C) 1996-1999 Eclipse Entertainment, L.L.C. All Rights Reserved */
  19. /* */
  20. /****************************************************************************************/
  21. #include "stdafx.h"
  22. #include <assert.h>
  23. #include "YUV.h"
  24. #include "Utility.h"
  25. #include "Cpu.h"
  26. #pragma warning(disable : 4244) // int -> uint8 conversions abound
  27. #pragma warning(disable : 4799) // I know we've got no emms; it's done in wavelet.c
  28. /*}{******* RGB <-> YUV in C ***********/
  29. void RGBb_to_YUVb(const uint8 *RGB,uint8 *YUV)
  30. {
  31. int R = RGB[0], G = RGB[1], B = RGB[2];
  32. YUV[0] = Y_RGB(R,G,B);
  33. YUV[1] = U_RGB(R,G,B) + 127;
  34. YUV[2] = V_RGB(R,G,B) + 127;
  35. }
  36. void YUVb_to_RGBb(const uint8 *YUV,uint8 *RGB)
  37. {
  38. int y,u,v,r,g,b;
  39. y = YUV[0];
  40. u = YUV[1] - 127;
  41. v = YUV[2] - 127;
  42. r = R_YUV(y,u,v);
  43. g = G_YUV(y,u,v);
  44. b = B_YUV(y,u,v);
  45. RGB[0] = minmax(r,0,255); // we could get negative ones and whatnot
  46. RGB[1] = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  47. RGB[2] = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  48. }
  49. void RGBb_to_YUVb_line(const uint8 *RGB,uint8 *YUV,int len)
  50. {
  51. int R,G,B;
  52. while(len--)
  53. {
  54. R = *RGB++;
  55. G = *RGB++;
  56. B = *RGB++;
  57. *YUV++ = Y_RGB(R,G,B);
  58. *YUV++ = U_RGB(R,G,B) + 127;
  59. *YUV++ = V_RGB(R,G,B) + 127;
  60. }
  61. }
  62. void YUVb_to_RGBb_line(const uint8 *YUV,uint8 *RGB,int len)
  63. {
  64. int y,u,v,r,g,b;
  65. while(len--)
  66. {
  67. y = (*YUV++);
  68. u = (*YUV++) - 127;
  69. v = (*YUV++) - 127;
  70. r = R_YUV(y,u,v);
  71. g = G_YUV(y,u,v);
  72. b = B_YUV(y,u,v);
  73. *RGB++ = minmax(r,0,255); // we could get negative ones and whatnot
  74. *RGB++ = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  75. *RGB++ = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  76. }
  77. }
  78. void RGBb_to_YUVi(const uint8 *RGB,int *Y,int *U,int *V)
  79. {
  80. int R = RGB[0], G = RGB[1], B = RGB[2];
  81. *Y = Y_RGB(R,G,B);
  82. *U = U_RGB(R,G,B) + 127;
  83. *V = V_RGB(R,G,B) + 127;
  84. assert( isinrange(*Y,0,255) );
  85. assert( isinrange(*U,0,255) );
  86. assert( isinrange(*V,0,255) );
  87. }
  88. void YUVi_to_RGBb(int y,int u,int v,uint8 *RGB)
  89. {
  90. int r,g,b;
  91. // yuv can be kicked out of 0,255 by the wavelet
  92. // assert( isinrange(y,0,255) );
  93. // assert( isinrange(u,0,255) );
  94. // assert( isinrange(v,0,255) );
  95. u -= 127;
  96. v -= 127;
  97. r = R_YUV(y,u,v); // this is just like a matrix multiply
  98. g = G_YUV(y,u,v);
  99. b = B_YUV(y,u,v);
  100. RGB[0] = minmax(r,0,255); // we could get negative ones and whatnot
  101. RGB[1] = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  102. RGB[2] = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  103. }
  104. void RGBi_to_YUVi(int R,int G,int B,int *Y,int *U,int *V)
  105. {
  106. assert( isinrange(R,0,255) );
  107. assert( isinrange(G,0,255) );
  108. assert( isinrange(B,0,255) );
  109. *Y = Y_RGB(R,G,B);
  110. *U = U_RGB(R,G,B) + 127;
  111. *V = V_RGB(R,G,B) + 127;
  112. assert( isinrange(*Y,0,255) );
  113. assert( isinrange(*U,0,255) );
  114. assert( isinrange(*V,0,255) );
  115. }
  116. void YUVi_to_RGBi(int y,int u,int v,int *R,int *G,int *B)
  117. {
  118. int r,g,b;
  119. // yuv can be kicked out of 0,255 by the wavelet
  120. // assert( isinrange(y,0,255) );
  121. // assert( isinrange(u,0,255) );
  122. // assert( isinrange(v,0,255) );
  123. u -= 127;
  124. v -= 127;
  125. r = R_YUV(y,u,v); // this is just like a matrix multiply
  126. g = G_YUV(y,u,v);
  127. b = B_YUV(y,u,v);
  128. *R = minmax(r,0,255); // we could get negative ones and whatnot
  129. *G = minmax(g,0,255); // because the y,u,v are not really 24 bits;
  130. *B = minmax(b,0,255); // there are regions of YUV space that will never be reached by RGBb_to_YUVb
  131. }
  132. void YUVi_to_RGBi_line(int *line1,int *line2,int *line3,int len)
  133. {
  134. int y,u,v,r,g,b;
  135. // <> use MMX
  136. cachetouch_w(line1,len>>3);
  137. cachetouch_w(line2,len>>3);
  138. cachetouch_w(line3,len>>3);
  139. while(len--)
  140. {
  141. y = *line1;
  142. u = *line2 - 127;
  143. v = *line3 - 127;
  144. r = R_YUV(y,u,v);
  145. g = G_YUV(y,u,v);
  146. b = B_YUV(y,u,v);
  147. r = minmax(r,0,255);
  148. g = minmax(g,0,255);
  149. b = minmax(b,0,255);
  150. *line1++ = r;
  151. *line2++ = g;
  152. *line3++ = b;
  153. }
  154. }
  155. void YUVi_to_BGRb_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
  156. {
  157. int y,u,v,r,g,b,len;
  158. int *line1,*line2,*line3;
  159. uint8 * bline;
  160. line1 = iline1;
  161. line2 = iline2;
  162. line3 = iline3;
  163. bline = ibline;
  164. len = ilen;
  165. cachetouch_r(line1,len>>3);
  166. cachetouch_r(line2,len>>3);
  167. cachetouch_r(line3,len>>3);
  168. cachetouch_w(bline,(len*3)>>5);
  169. while(len--)
  170. {
  171. y = (*line1++);
  172. u = (*line2++) - 127;
  173. v = (*line3++) - 127;
  174. r = R_YUV(y,u,v);
  175. g = G_YUV(y,u,v);
  176. b = B_YUV(y,u,v);
  177. r = minmax(r,0,255);
  178. g = minmax(g,0,255);
  179. b = minmax(b,0,255);
  180. bline[0] = b;
  181. bline[1] = g;
  182. bline[2] = r;
  183. bline+=3;
  184. }
  185. }
  186. void YUVi_to_BGRb_lines_c(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
  187. {
  188. int yz;
  189. for(yz=0;yz<h;yz++)
  190. {
  191. int y,u,v,r,g,b,len;
  192. int *line1,*line2,*line3;
  193. uint8 * bline;
  194. line1 = Ylines[yz];
  195. line2 = Ulines[yz];
  196. line3 = Vlines[yz];
  197. bline = BGRptr;
  198. len = w;
  199. cachetouch_r(line1,len>>3);
  200. cachetouch_r(line2,len>>3);
  201. cachetouch_r(line3,len>>3);
  202. cachetouch_w(bline,(len*3)>>5);
  203. while(len--)
  204. {
  205. y = (*line1++);
  206. u = (*line2++) - 127;
  207. v = (*line3++) - 127;
  208. r = R_YUV(y,u,v);
  209. g = G_YUV(y,u,v);
  210. b = B_YUV(y,u,v);
  211. r = minmax(r,0,255);
  212. g = minmax(g,0,255);
  213. b = minmax(b,0,255);
  214. bline[0] = b;
  215. bline[1] = g;
  216. bline[2] = r;
  217. bline+=3;
  218. }
  219. BGRptr += BGRstride;
  220. }
  221. }
  222. void YUVi_to_XRGB_line_c(int *iline1,int *iline2,int *iline3,uint8 * ibline,int ilen)
  223. {
  224. int y,u,v,r,g,b,len;
  225. int *line1,*line2,*line3;
  226. uint8 * bline;
  227. line1 = iline1;
  228. line2 = iline2;
  229. line3 = iline3;
  230. bline = ibline;
  231. len = ilen;
  232. cachetouch_r(line1,len>>3);
  233. cachetouch_r(line2,len>>3);
  234. cachetouch_r(line3,len>>3);
  235. cachetouch_w(bline,len>>3);
  236. while(len--)
  237. {
  238. y = (*line1++);
  239. u = (*line2++) - 127;
  240. v = (*line3++) - 127;
  241. r = R_YUV(y,u,v);
  242. g = G_YUV(y,u,v);
  243. b = B_YUV(y,u,v);
  244. r = minmax(r,0,255);
  245. g = minmax(g,0,255);
  246. b = minmax(b,0,255);
  247. bline[0] = b;
  248. bline[1] = g;
  249. bline[2] = r;
  250. bline += 4;
  251. }
  252. }
  253. /*}{******* MMX YUV -> BGR blitters ***********/
  /*
   * Per-lane multipliers loaded into mm3 / mm4 by the MMX blitters below and
   * applied with pmulhw (signed 16x16 multiply, high word kept).
   *
   * Each 64-bit constant is four signed 16-bit lanes.  Lane 0 is the lowest
   * word and ends up in the first output byte, so lanes 0,1,2,3 correspond
   * to B, G, R, X:
   *
   *   Const_V_16 : lane0 = 0x0000 (0)      lane1 = 0x2DB5 (11701)
   *                lane2 = 0xA646 (-22970) lane3 = 0x0000 (0)
   *   Const_U_16 : lane0 = 0x7168 (29032)  lane1 = 0xE9FA (-5638)
   *                lane2 = 0x0000 (0)      lane3 = 0x0000 (0)
   *
   * NOTE(review): presumably these are the fixed-point counterparts of the
   * R_YUV / G_YUV / B_YUV macro coefficients - confirm against YUV.h.
   */
  static const __int64 Const_V_16 = 0x0000A6462DB50000;
  static const __int64 Const_U_16 = 0x00000000E9FA7168;
  /*
   * MMX version: convert h rows of planar int Y,U,V samples into a packed
   * 24-bit B,G,R image.
   *
   *  w, h      : image width and height in pixels; asserted (once per row)
   *              to both be greater than 1
   *  Ylines    : h row pointers to Y samples
   *  Ulines    : h row pointers to U samples (biased)
   *  Vlines    : h row pointers to V samples (biased)
   *  BGRptr    : first byte of the destination image
   *  BGRstride : byte distance from one destination row to the next
   *
   * Per row, the first w-1 pixels are each written with a 4-byte store that
   * is then advanced by only 3 bytes, so the next pixel overwrites the spare
   * byte.  The final pixel is written as 2 + 1 bytes so that nothing is
   * stored past the 3*w bytes of the row.
   *
   * w == 1 is not supported: ecx would start at 0 and the dec/jnz loop would
   * wrap around instead of being skipped.
   *
   * No emms is executed here; per the C4799 pragma note at the top of the
   * file it is issued in wavelet.c.
   */
  void YUVi_to_BGRb_lines_mmx(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride)
  {
      int yz;
      for(yz=0;yz<h;yz++)
      {
          int *line1,*line2,*line3;
          uint8 * bline;
          // the asm advances these locals in memory ("add lineN,4")
          line1 = Ylines[yz];
          line2 = Ulines[yz];
          line3 = Vlines[yz];
          bline = BGRptr;
          BGRptr += BGRstride;
          assert(w > 1 && h > 1 );
          // NOTE(review): cachetouch_* are defined elsewhere; the counts look
          // like 32-byte units - confirm.
          cachetouch_r(line1,w>>3);
          cachetouch_r(line2,w>>3);
          cachetouch_r(line3,w>>3);
          cachetouch_w(bline,(w*3)>>5);
          __asm
          {
              mov ecx,w
              sub ecx,1 // the loop handles w-1 pixels; the last one is done after it
              mov edi,bline
              movq mm3,Const_V_16
              movq mm4,Const_U_16
          More:
              /**
              *
              * ecx is the number of pixels left for the loop
              * edi is the output pointer (starts at bline)
              *
              * eax is (V<<2)-509
              * ebx is (U<<2)-509
              * edx is Y
              *
              * the multiply coefficients are in 14 bits, then we rshr 16 via mulhw
              *
              * mm0 is four V int16's, multiplied by their coefficients (mm3)
              * mm1 is four U int16's, multiplied by their coefficients (mm4)
              * mm2 is four Y int16's
              *
              * XRGB = mm0 + mm1 + mm2
              *
              * we're taking about 45 clocks
              * my manual count indicates we could take about 37 if we were perfect
              */
              /*
              *
              * MMX optimization notes:
              * 1. there is only one MMX pack/unpack unit
              * 2. there is only one MMX multiply unit
              * 3. MMX instructions that use memory or integers use port 0 only
              * 4. all MMX instructions are 1 clock except multiply, which is 3
              */
              mov eax,line3 // V
              mov eax,[eax] // eax = v; hard stall on eax, inevitable
              add line3,4 // no stall on line3
              shl eax,2 // V<<=2
              mov ebx,line2 // U
              sub eax,509 // (V<<2)-509; a plain (V-127)<<2 would be (V<<2)-508.
                          // NOTE(review): YUVi_to_BGRb_line_mmx2 and YUVi_to_XRGB_line_mmx
                          // subtract 510 here; the reason for the difference is not visible.
              mov ebx,[ebx] // ebx = u
              add line2,4
              movd mm0,eax // mm0 = [0][v]
              shl ebx,2
              punpckldq mm0,mm0 // mm0 = [v][v]
              sub ebx,509
              packssdw mm0,mm0 // mm0 = [v][v][v][v]
              movd mm1,ebx // mm1 = [0][u]
              mov edx,line1 // Y
              pmulhw mm0,mm3 // keep only high words; same as multiplying in 32 bits and doing >>16
              // put some non-dependent stuff after the multiply:
              mov edx,[edx] // edx = y
              punpckldq mm1,mm1 // mm1 = [u][u]
              movd mm2,edx // mm2 = [0][y]
              packssdw mm1,mm1 // mm1 = [u][u][u][u]
              // these two packs cannot pair!
              punpckldq mm2,mm2 // mm2 = [y][y]
              pmulhw mm1,mm4
              // put some stuff after the multiply:
              add line1,4
              packssdw mm2,mm2 // mm2 = [y][y][y][y]
              // now XRGB = mm0 + mm1 + mm2
              paddsw mm0,mm1
              paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
              // convert the four int16s to eight bytes; also do a clamp(0,255) for free!
              packuswb mm0,mm0 // hard stall on mm0, inevitable
              movd [edi],mm0 // hard stall on mm0, then unaligned write! bad!
                             // stores 4 bytes; the 4th is overwritten by the next pixel
              add edi,3 // no stall on edi
              dec ecx
              jnz More
              //{ last pixel of the row: same math, but stored as 2+1 bytes
              //  instead of 4 so that nothing is written past the row
              mov eax,line3 // V
              mov eax,[eax]
              add line3,4
              shl eax,2
              sub eax,509
              movd mm0,eax // mm0 = [0][v]
              punpckldq mm0,mm0 // mm0 = [v][v]
              packssdw mm0,mm0 // mm0 = [v][v][v][v]
              pmulhw mm0,mm3
              mov ebx,line2 // U
              mov ebx,[ebx]
              add line2,4
              shl ebx,2
              sub ebx,509
              movd mm1,ebx // mm1 = [0][u]
              punpckldq mm1,mm1 // mm1 = [u][u]
              packssdw mm1,mm1 // mm1 = [u][u][u][u]
              pmulhw mm1,mm4
              mov edx,line1 // Y
              mov edx,[edx]
              add line1,4
              movd mm2,edx // mm2 = [0][y]
              punpckldq mm2,mm2 // mm2 = [y][y]
              packssdw mm2,mm2 // mm2 = [y][y][y][y]
              paddsw mm0,mm1
              paddsw mm0,mm2
              packuswb mm0,mm0
              movd eax,mm0 // eax is XRGB
              mov [edi],ax // bytes 0 and 1 (B, G)
              shr eax,16
              mov [edi+2],al // byte 2 (R)
              //}
          }
      }
      // no emms on purpose: it is issued in wavelet.c (see the C4799 pragma note)
  }
  /*
   * MMX version: convert one row of planar int Y,U,V samples into packed
   * 24-bit B,G,R pixels.
   *
   *  line1 : Y samples
   *  line2 : U samples (biased)
   *  line3 : V samples (biased)
   *  bline : output, 3 bytes per pixel in the order B, G, R
   *  len   : number of pixels, asserted to be greater than 1
   *
   * The asm loop converts the first len-1 pixels, each with a 4-byte store
   * advanced by 3 bytes.  The last pixel is converted by the plain C code
   * that follows the asm and is stored as exactly 3 bytes, so nothing is
   * written past the end of the row.
   *
   * len == 1 is not supported: after the decrement ecx would start at 0 and
   * the dec/jnz loop would wrap around instead of being skipped.
   *
   * NOTE(review): the asm biases chroma with (x<<2)-510 while the C tail
   * uses x-127 with the YUV.h macros, so the last pixel may round slightly
   * differently from the others - confirm whether that matters.
   */
  void YUVi_to_BGRb_line_mmx2(int *line1,int *line2,int *line3,uint8 * bline,int len)
  {
      assert(len > 1 );
      len --; // from here on len is the number of pixels handled by the asm loop
      // NOTE(review): cachetouch_* are defined elsewhere; the counts look like
      // 32-byte units and are computed from the already-decremented len.
      cachetouch_r(line1,len>>3);
      cachetouch_r(line2,len>>3);
      cachetouch_r(line3,len>>3);
      cachetouch_w(bline,(len*3)>>5);
      __asm
      {
          mov ecx,len
          mov edi,bline
          movq mm3,Const_V_16
          movq mm4,Const_U_16
      YUVi_to_BGRb_line_mmx2_More:
          mov eax,line3 // V
          mov eax,[eax] // hard stall on eax, inevitable
          add line3,4 // no stall on line3
          shl eax,2
          mov ebx,line2 // U
          sub eax,510 // eax = (V<<2)-510
          mov ebx,[ebx]
          add line2,4
          movd mm0,eax // mm0 = [0][v]
          shl ebx,2
          punpckldq mm0,mm0 // mm0 = [v][v]
          sub ebx,510 // ebx = (U<<2)-510
          packssdw mm0,mm0 // mm0 = [v][v][v][v]
          movd mm1,ebx // mm1 = [0][u]
          pmulhw mm0,mm3 // V times the per-lane coefficients, high words kept
          mov edx,line1 // Y
          punpckldq mm1,mm1 // mm1 = [u][u]
          mov edx,[edx]
          packssdw mm1,mm1 // mm1 = [u][u][u][u]
          movd mm2,edx // mm2 = [0][y]
          add line1,4
          punpckldq mm2,mm2 // mm2 = [y][y]
          pmulhw mm1,mm4 // U times the per-lane coefficients, high words kept
          packssdw mm2,mm2 // mm2 = [y][y][y][y]
          paddsw mm0,mm1
          paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
          packuswb mm0,mm0 // hard stall on mm0, inevitable; saturates to 0..255
          movd [edi],mm0 // unaligned write! bad! (4 bytes; the 4th is overwritten next)
          add edi,3 // no stall on edi
          dec ecx
          jnz YUVi_to_BGRb_line_mmx2_More
          mov bline,edi // hand the advanced output pointer back to the C code below
          // no emms here: it is issued in wavelet.c (see the C4799 pragma note)
      }
      {
          // last pixel in plain C; line1..line3 were advanced by the asm above
          int y,u,v,r,g,b;
          y = (*line1);
          u = (*line2) - 127;
          v = (*line3) - 127;
          r = R_YUV(y,u,v);
          g = G_YUV(y,u,v);
          b = B_YUV(y,u,v);
          r = minmax(r,0,255);
          g = minmax(g,0,255);
          b = minmax(b,0,255);
          bline[0] = b;
          bline[1] = g;
          bline[2] = r;
      }
  }
  /*
   * MMX version: convert one row of planar int Y,U,V samples into packed
   * 32-bit pixels.
   *
   *  line1 : Y samples
   *  line2 : U samples (biased)
   *  line3 : V samples (biased)
   *  bline : output, 4 bytes per pixel; bytes 0,1,2 receive B,G,R
   *  len   : number of pixels, asserted to be greater than 0
   *
   * Every pixel is written with one 4-byte store and the output pointer is
   * advanced by 4, so no special handling of the last pixel is needed.
   * Unlike YUVi_to_XRGB_line_c, the fourth byte IS written: lane 3 of both
   * coefficient constants is zero, so that byte ends up holding the
   * saturated Y value.
   *
   * len == 0 is not supported: ecx would start at 0 and the dec/jnz loop
   * would wrap around instead of being skipped.
   */
  void YUVi_to_XRGB_line_mmx(int *line1,int *line2,int *line3,uint8 * bline,int len)
  {
      assert(len > 0 );
      // NOTE(review): cachetouch_* are defined elsewhere; the counts look like
      // 32-byte units - confirm.
      cachetouch_r(line1,len>>3);
      cachetouch_r(line2,len>>3);
      cachetouch_r(line3,len>>3);
      cachetouch_w(bline,len>>3);
      __asm
      {
          mov ecx,len
          mov edi,bline
          movq mm3,Const_V_16
          movq mm4,Const_U_16
      More:
          mov eax,line3 // V
          mov eax,[eax] // hard stall on eax, inevitable
          add line3,4 // no stall on line3
          shl eax,2
          mov ebx,line2 // U
          sub eax,510 // eax = (V<<2)-510
          mov ebx,[ebx]
          add line2,4
          movd mm0,eax // mm0 = [0][v]
          shl ebx,2
          punpckldq mm0,mm0 // mm0 = [v][v]
          sub ebx,510 // ebx = (U<<2)-510
          packssdw mm0,mm0 // mm0 = [v][v][v][v]
          movd mm1,ebx // mm1 = [0][u]
          pmulhw mm0,mm3 // V times the per-lane coefficients, high words kept
          mov edx,line1 // Y
          punpckldq mm1,mm1 // mm1 = [u][u]
          mov edx,[edx]
          packssdw mm1,mm1 // mm1 = [u][u][u][u]
          movd mm2,edx // mm2 = [0][y]
          add line1,4
          punpckldq mm2,mm2 // mm2 = [y][y]
          pmulhw mm1,mm4 // U times the per-lane coefficients, high words kept
          packssdw mm2,mm2 // mm2 = [y][y][y][y]
          paddsw mm0,mm1
          paddsw mm0,mm2 // hard stall on mm0, inevitable ; no stall on mm2
          packuswb mm0,mm0 // hard stall on mm0, inevitable; saturates to 0..255
          movd [edi],mm0 // one full 4-byte pixel
          add edi,4 // no stall on edi
          dec ecx
          jnz More
          // no emms here: it is issued in wavelet.c (see the C4799 pragma note)
      }
  }
  495. /*}{******* CPU setup ***********/
  /*
   * Run-time dispatch pointers for the YUV -> pixel blitters.
   * Both are NULL until SetupYUV() has been called, which points them at
   * either the MMX or the portable C implementation.
   */
  void (*YUVi_to_XRGB_line)(int *line1,int *line2,int *line3,uint8 * bline,int len) = NULL;
  void (*YUVi_to_BGRb_lines)(int w,int h,int **Ylines,int **Ulines,int **Vlines,uint8 * BGRptr,int BGRstride) = NULL;
  498. void SetupYUV(void)
  499. {
  500. jeCPU_GetInfo();
  501. if ( jeCPU_Features & JE_CPU_HAS_MMX )
  502. {
  503. // timed on hare512.bmp :
  504. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_mmx1; // blit : 0.025 seconds = 47.2 clocks / pixel
  505. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_mmx2; // blit : 0.025 seconds = 47.2 clocks / pixel
  506. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_c; // blit : 0.034 seconds = 66.6 clocks / pixel
  507. YUVi_to_XRGB_line = YUVi_to_XRGB_line_mmx;
  508. YUVi_to_BGRb_lines = YUVi_to_BGRb_lines_mmx;// blit : 0.0245 seconds= 45.9 clocks / pixel
  509. }
  510. else
  511. {
  512. // YUVi_to_BGRb_line = YUVi_to_BGRb_line_c;
  513. YUVi_to_XRGB_line = YUVi_to_XRGB_line_c;
  514. YUVi_to_BGRb_lines = YUVi_to_BGRb_lines_c;
  515. }
  516. }
  517. /*}******* EOF ***********/