
/src/dsutil/vd.cpp

https://bitbucket.org/Tomasen/splayer/
// VirtualDub - Video processing and capture application
// Copyright (C) 1998-2001 Avery Lee
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
//
// Notes:
// - BitBltFromI420ToRGB is from VirtualDub
// - The core assembly function of CCpuID is from DVD2AVI
// - sse2 yv12 to yuy2 conversion by Haali
// (- vd.cpp/h should be renamed to something more sensible already :)

#include "stdafx.h"
#include "vd.h"
#include "..\svplib\svplib.h"

#undef SVP_LogMsg5
#define SVP_LogMsg5 __noop

#pragma warning(disable : 4799) // no emms... blahblahblah

#ifdef _WIN64

CCpuID g_cpuid;

CCpuID::CCpuID()
{
    // TODOX64: no inline asm under x64, so assume an MMX/SSE-capable CPU
    m_flags = (flag_t)7;
}

static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
{
    WORD* dstw = (WORD*)dst;
    for(; width > 1; width -= 2)
    {
        *dstw++ = (*srcu++<<8)|*srcy++;
        *dstw++ = (*srcv++<<8)|*srcy++;
    }
}
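// Note on the packing above (descriptive): each WORD store is little-endian, so
// (*srcu<<8)|*srcy lands in memory as the byte pair Y,U and the next store as
// Y,V -- i.e. the YUY2 layout Y0 U0 Y1 V1 for every two horizontal pixels.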
static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
{
    WORD* dstw = (WORD*)dst;
    for(; width > 1; width -= 2, srcu++, srcv++)
    {
        *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++;
        *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++;
    }
}

static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    BYTE* src2 = src + srcpitch;
    // blend exactly w bytes of two rows: dst = (a + b + 1) / 2
    do {*dst++ = (*src++ + *src2++ + 1) >> 1;}
    while(--w);
}

static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    BYTE* src2 = src + srcpitch;
    BYTE* src3 = src2 + srcpitch;
    // blend exactly w bytes of three rows with 1:2:1 weights
    do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;}
    while(--w);
}
bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
{
    if(w&1) return(false);
    if(w > 0 && w == srcpitch && w == dstpitch)
    {
        memcpy(dsty, srcy, h*srcpitch);
        memcpy(dstu, srcu, h/2*srcpitch/2);
        memcpy(dstv, srcv, h/2*srcpitch/2);
    }
    else
    {
        int pitch = min(abs(srcpitch), abs(dstpitch));
        for(int y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch)
            memcpy(dsty, srcy, pitch);
        srcpitch >>= 1;
        dstpitch >>= 1;
        pitch = min(abs(srcpitch), abs(dstpitch));
        for(int y = 0; y < h; y += 2, srcu += srcpitch, dstu += dstpitch)
            memcpy(dstu, srcu, pitch);
        for(int y = 0; y < h; y += 2, srcv += srcpitch, dstv += dstpitch)
            memcpy(dstv, srcv, pitch);
    }
    return true;
}
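// Usage sketch (illustrative; CopyI420Frame_example is not part of the original
// file): for a w x h I420 frame stored with pitch == w, the luma plane is
// followed by two (w/2) x (h/2) chroma planes.
static inline bool CopyI420Frame_example(BYTE* dst, BYTE* src, int w, int h)
{
    BYTE* dsty = dst;
    BYTE* dstu = dsty + w*h;            // U plane starts after the luma plane
    BYTE* dstv = dstu + (w/2)*(h/2);    // V plane after the U plane
    BYTE* srcy = src;
    BYTE* srcu = srcy + w*h;
    BYTE* srcv = srcu + (w/2)*(h/2);
    return BitBltFromI420ToI420(w, h, dsty, dstu, dstv, w, srcy, srcu, srcv, w);
}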
bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced)
{
    if(w <= 0 || h <= 0 || (w&1) || (h&1))
        return(false);
    if(srcpitch == 0) srcpitch = w;
    do
    {
        yuvtoyuy2row_c(dst, srcy, srcu, srcv, w);
        yuvtoyuy2row_avg_c(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2);
        dst += 2*dstpitch;
        srcy += srcpitch*2;
        srcu += srcpitch/2;
        srcv += srcpitch/2;
    }
    while((h -= 2) > 2);
    // the last two rows have no chroma row below them to average with
    yuvtoyuy2row_c(dst, srcy, srcu, srcv, w);
    yuvtoyuy2row_c(dst + dstpitch, srcy + srcpitch, srcu, srcv, w);
    return(true);
}

bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch)
{
    if(w > 0 && w == srcpitch && w == dstpitch)
    {
        memcpy(dst, src, h*srcpitch);
    }
    else
    {
        int pitch = min(abs(srcpitch), abs(dstpitch));
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
            memcpy(dst, src, pitch);
    }
    return(true);
}

bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
{
    ASSERT(FALSE); // not implemented on x64
    return false;
}
bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp)
{
    if(dbpp == sbpp)
    {
        int rowbytes = w*dbpp>>3;
        if(rowbytes > 0 && rowbytes == srcpitch && rowbytes == dstpitch)
        {
            memcpy(dst, src, h*rowbytes);
        }
        else
        {
            for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
                memcpy(dst, src, rowbytes);
        }
        return(true);
    }
    if((sbpp != 16 && sbpp != 24 && sbpp != 32)
    || (dbpp != 16 && dbpp != 24 && dbpp != 32))
        return(false);
    if(dbpp == 16)
    {
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        {
            if(sbpp == 24)
            {
                BYTE* s = (BYTE*)src;
                WORD* d = (WORD*)dst;
                for(int x = 0; x < w; x++, s += 3, d++)
                    *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f));
            }
            else if(sbpp == 32)
            {
                DWORD* s = (DWORD*)src;
                WORD* d = (WORD*)dst;
                for(int x = 0; x < w; x++, s++, d++)
                    *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f));
            }
        }
    }
    else if(dbpp == 24)
    {
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        {
            if(sbpp == 16)
            {
                WORD* s = (WORD*)src;
                BYTE* d = (BYTE*)dst;
                for(int x = 0; x < w; x++, s++, d += 3)
                { // not tested, r-g-b might be in reverse
                    d[0] = (*s&0x001f)<<3;
                    d[1] = (*s&0x07e0)>>3;
                    d[2] = (*s&0xf800)>>8;
                }
            }
            else if(sbpp == 32)
            {
                BYTE* s = (BYTE*)src;
                BYTE* d = (BYTE*)dst;
                for(int x = 0; x < w; x++, s += 4, d += 3)
                    {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];}
            }
        }
    }
    else if(dbpp == 32)
    {
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        {
            if(sbpp == 16)
            {
                WORD* s = (WORD*)src;
                DWORD* d = (DWORD*)dst;
                for(int x = 0; x < w; x++, s++, d++)
                    *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3);
            }
            else if(sbpp == 24)
            {
                BYTE* s = (BYTE*)src;
                DWORD* d = (DWORD*)dst;
                for(int x = 0; x < w; x++, s += 3, d++)
                    *d = *((DWORD*)s)&0xffffff;
            }
        }
    }
    return(true);
}
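// Worked example of the 565 packing above (illustrative; pack_rgb32_to_565 is
// not part of the original file): for a 32bpp pixel 0x00RRGGBB,
// (p >> 8) & 0xf800 keeps the top five red bits in bits 15..11,
// (p >> 5) & 0x07e0 the top six green bits in bits 10..5, and
// (p >> 3) & 0x001f the top five blue bits; e.g. 0x00FF8040 packs to 0xFC08.
static inline WORD pack_rgb32_to_565(DWORD p)
{
    return (WORD)(((p >> 8) & 0xf800) | ((p >> 5) & 0x07e0) | ((p >> 3) & 0x001f));
}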
bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch)
{
    ASSERT(FALSE); // not implemented on x64
    return false;
}
void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch)
{
    asm_blend_row_clipped_c(dst, src, rowbytes, srcpitch);
    if((h -= 2) > 0) do
    {
        dst += dstpitch;
        asm_blend_row_c(dst, src, rowbytes, srcpitch);
        src += srcpitch;
    }
    while(--h);
    asm_blend_row_clipped_c(dst + dstpitch, src, rowbytes, srcpitch);
}

void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield)
{
    if(topfield)
    {
        BitBltFromRGBToRGB(rowbytes, h/2, dst, dstpitch*2, 8, src, srcpitch*2, 8);
        AvgLines8(dst, h, dstpitch);
    }
    else
    {
        BitBltFromRGBToRGB(rowbytes, h/2, dst + dstpitch, dstpitch*2, 8, src + srcpitch, srcpitch*2, 8);
        AvgLines8(dst + dstpitch, h-1, dstpitch);
    }
}
void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)
{
    if(h <= 1) return;
    BYTE* s = dst;
    BYTE* d = dst + (h-2)*pitch;
    for(; s < d; s += pitch*2)
    {
        BYTE* tmp = s;
        for(int i = pitch; i--; tmp++)
        {
            tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
        }
    }
    if(!(h&1) && h >= 2)
    {
        dst += (h-2)*pitch;
        memcpy(dst + pitch, dst, pitch);
    }
}
#else // _WIN64

CCpuID::CCpuID()
{
    DWORD flags = 0;
    __asm
    {
        mov eax, 1
        cpuid
        test edx, 0x00800000 // STD MMX
        jz TEST_SSE
        or [flags], 1
    TEST_SSE:
        test edx, 0x02000000 // STD SSE
        jz TEST_SSE2
        or [flags], 2
        or [flags], 4
    TEST_SSE2:
        test edx, 0x04000000 // SSE2
        jz TEST_3DNOW
        or [flags], 8
    TEST_3DNOW:
        mov eax, 0x80000001
        cpuid
        test edx, 0x80000000 // 3D NOW
        jz TEST_SSEMMX
        or [flags], 16
    TEST_SSEMMX:
        test edx, 0x00400000 // SSE MMX
        jz TEST_END
        or [flags], 2
    TEST_END:
    }
    m_flags = (flag_t)flags;
}

CCpuID g_cpuid;
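// Alternative sketch (an assumption, not part of the original build): the same
// feature bits collected with the __cpuid intrinsic from <intrin.h>, which
// would also work under _WIN64, where MSVC has no inline assembly.
#include <intrin.h>
static inline DWORD DetectCpuFlagsWithIntrinsics()
{
    int info[4];
    __cpuid(info, 1);
    DWORD flags = 0;
    if(info[3] & 0x00800000) flags |= CCpuID::mmx;        // STD MMX
    if(info[3] & 0x02000000) flags |= 2 | CCpuID::ssefpu; // STD SSE (same two bits as the asm above)
    if(info[3] & 0x04000000) flags |= CCpuID::sse2;       // SSE2
    __cpuid(info, 0x80000001);
    if(info[3] & 0x80000000) flags |= 16;                 // 3DNow!
    if(info[3] & 0x00400000) flags |= 2;                  // AMD MMX extensions
    return flags;
}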
void memcpy_accel(void* dst, const void* src, size_t len)
{
    if((g_cpuid.m_flags & CCpuID::ssefpu) && len >= 128
        && !((DWORD)src&15) && !((DWORD)dst&15))
    {
        __asm
        {
            mov esi, dword ptr [src]
            mov edi, dword ptr [dst]
            mov ecx, len
            shr ecx, 7 // 128-byte blocks
        memcpy_accel_sse_loop:
            prefetchnta [esi+16*8]
            movaps xmm0, [esi]
            movaps xmm1, [esi+16*1]
            movaps xmm2, [esi+16*2]
            movaps xmm3, [esi+16*3]
            movaps xmm4, [esi+16*4]
            movaps xmm5, [esi+16*5]
            movaps xmm6, [esi+16*6]
            movaps xmm7, [esi+16*7]
            movntps [edi], xmm0
            movntps [edi+16*1], xmm1
            movntps [edi+16*2], xmm2
            movntps [edi+16*3], xmm3
            movntps [edi+16*4], xmm4
            movntps [edi+16*5], xmm5
            movntps [edi+16*6], xmm6
            movntps [edi+16*7], xmm7
            add esi, 128
            add edi, 128
            dec ecx
            jne memcpy_accel_sse_loop
            mov ecx, len
            and ecx, 127 // copy the tail bytewise
            cmp ecx, 0
            je memcpy_accel_sse_end
        memcpy_accel_sse_loop2:
            mov dl, byte ptr [esi]
            mov byte ptr [edi], dl
            inc esi
            inc edi
            dec ecx
            jne memcpy_accel_sse_loop2
        memcpy_accel_sse_end:
            emms
            sfence
        }
    }
    else if((g_cpuid.m_flags & CCpuID::mmx) && len >= 64
        && !((DWORD)src&7) && !((DWORD)dst&7))
    {
        __asm
        {
            mov esi, dword ptr [src]
            mov edi, dword ptr [dst]
            mov ecx, len
            shr ecx, 6 // 64-byte blocks
        memcpy_accel_mmx_loop:
            movq mm0, qword ptr [esi]
            movq mm1, qword ptr [esi+8*1]
            movq mm2, qword ptr [esi+8*2]
            movq mm3, qword ptr [esi+8*3]
            movq mm4, qword ptr [esi+8*4]
            movq mm5, qword ptr [esi+8*5]
            movq mm6, qword ptr [esi+8*6]
            movq mm7, qword ptr [esi+8*7]
            movq qword ptr [edi], mm0
            movq qword ptr [edi+8*1], mm1
            movq qword ptr [edi+8*2], mm2
            movq qword ptr [edi+8*3], mm3
            movq qword ptr [edi+8*4], mm4
            movq qword ptr [edi+8*5], mm5
            movq qword ptr [edi+8*6], mm6
            movq qword ptr [edi+8*7], mm7
            add esi, 64
            add edi, 64
            loop memcpy_accel_mmx_loop
            mov ecx, len
            and ecx, 63 // copy the tail bytewise
            cmp ecx, 0
            je memcpy_accel_mmx_end
        memcpy_accel_mmx_loop2:
            mov dl, byte ptr [esi]
            mov byte ptr [edi], dl
            inc esi
            inc edi
            dec ecx
            jne memcpy_accel_mmx_loop2
        memcpy_accel_mmx_end:
            emms
        }
    }
    else
    {
        memcpy(dst, src, len);
    }
}
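// Usage note (illustrative; clone_buffer_aligned is not part of the original
// file): the SSE path above requires both pointers to be 16-byte aligned,
// otherwise the call quietly degrades to plain memcpy. A minimal sketch,
// assuming MSVC's _aligned_malloc from <malloc.h>:
#include <malloc.h>
static inline BYTE* clone_buffer_aligned(const BYTE* src, size_t len)
{
    BYTE* dst = (BYTE*)_aligned_malloc(len, 16); // alignment enables the movaps/movntps loop
    if(dst)
        memcpy_accel(dst, src, len); // still falls back to memcpy if src is unaligned
    return dst; // caller releases with _aligned_free
}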
static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
{
    WORD* dstw = (WORD*)dst;
    for(; width > 1; width -= 2)
    {
        *dstw++ = (*srcu++<<8)|*srcy++;
        *dstw++ = (*srcv++<<8)|*srcy++;
    }
}

static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
{
    WORD* dstw = (WORD*)dst;
    for(; width > 1; width -= 2, srcu++, srcv++)
    {
        *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++;
        *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++;
    }
}

static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    BYTE* src2 = src + srcpitch;
    // blend exactly w bytes of two rows: dst = (a + b + 1) / 2
    do {*dst++ = (*src++ + *src2++ + 1) >> 1;}
    while(--w);
}

static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    BYTE* src2 = src + srcpitch;
    BYTE* src3 = src2 + srcpitch;
    // blend exactly w bytes of three rows with 1:2:1 weights
    do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;}
    while(--w);
}
bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
{
    if(w&1) return(false);
    int orgh = h;
    if(h&1) h--;
    if(w > 0 && w == srcpitch && w == dstpitch)
    {
        memcpy_accel(dsty, srcy, orgh*srcpitch);
        memcpy_accel(dstu, srcu, h*srcpitch/4);
        memcpy_accel(dstv, srcv, h*srcpitch/4);
    }
    else
    {
        int pitch = min(abs(srcpitch), abs(dstpitch));
        SVP_LogMsg5(L"pitch %d %d %d", abs(srcpitch), abs(dstpitch), pitch);
        for(int y = 0; y < orgh; y++, srcy += srcpitch, dsty += dstpitch)
        {
            memcpy_accel(dsty, srcy, pitch);
            memset(dsty+pitch, 0x00, abs(dstpitch)-pitch); // blank the luma padding
        }
        srcpitch >>= 1;
        dstpitch >>= 1;
        pitch = min(abs(srcpitch), abs(dstpitch));
        for(int y = 0; y < h; y += 2, srcu += srcpitch, dstu += dstpitch)
        {
            memcpy_accel(dstu, srcu, pitch);
            memset(dstu+pitch, 0x80, abs(dstpitch)-pitch); // neutral chroma in the padding
        }
        for(int y = 0; y < h; y += 2, srcv += srcpitch, dstv += dstpitch)
        {
            memcpy_accel(dstv, srcv, pitch);
            memset(dstv+pitch, 0x80, abs(dstpitch)-pitch);
        }
    }
    return true;
}
bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch)
{
    if(w > 0 && w == srcpitch && w == dstpitch)
    {
        memcpy_accel(dst, src, h*srcpitch);
    }
    else
    {
        int pitch = min(abs(srcpitch), abs(dstpitch));
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
            memcpy_accel(dst, src, pitch);
    }
    return(true);
}

extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
{
    if(h&1) h--;
    if(w <= 0 || h <= 0 || (w&1) || (h&1))
        return(false);
    void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;
    if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7))
    {
        switch(dbpp)
        {
        case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_ISSE*/; break; // TODO: fix _ISSE (555->565)
        case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break;
        case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break;
        }
    }
    else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
    {
        switch(dbpp)
        {
        case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_MMX*/; break; // TODO: fix _MMX (555->565)
        case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break;
        case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break;
        }
    }
    else
    {
        switch(dbpp)
        {
        case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
        case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break;
        case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break;
        }
    }
    if(!asm_YUVtoRGB_row)
        return(false);
    do
    {
        asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2);
        dst += 2*dstpitch;
        srcy += srcpitch*2;
        srcu += srcpitch/2;
        srcv += srcpitch/2;
    }
    while(h -= 2);
    if(g_cpuid.m_flags & CCpuID::mmx)
        __asm emms
    if(g_cpuid.m_flags & CCpuID::ssefpu)
        __asm sfence
    return true;
}
static void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
{
    __asm {
        push ebp
        push edi
        push esi
        push ebx
        mov edi, [esp+20] // dst
        mov ebp, [esp+24] // srcy
        mov ebx, [esp+28] // srcu
        mov esi, [esp+32] // srcv
        mov ecx, [esp+36] // width
        shr ecx, 3
    yuvtoyuy2row_loop:
        movd mm0, [ebx]
        punpcklbw mm0, [esi]
        movq mm1, [ebp]
        movq mm2, mm1
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        movq [edi], mm1
        movq [edi+8], mm2
        add ebp, 8
        add ebx, 4
        add esi, 4
        add edi, 16
        dec ecx
        jnz yuvtoyuy2row_loop
        pop ebx
        pop esi
        pop edi
        pop ebp
        ret
    };
}
static void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
{
    static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64;
    __asm {
        push ebp
        push edi
        push esi
        push ebx
        movq mm7, mask
        mov edi, [esp+20] // dst
        mov ebp, [esp+24] // srcy
        mov ebx, [esp+28] // srcu
        mov esi, [esp+32] // srcv
        mov ecx, [esp+36] // width
        mov eax, [esp+40] // pitchuv
        shr ecx, 3
    yuvtoyuy2row_avg_loop:
        movd mm0, [ebx]
        punpcklbw mm0, [esi]
        movq mm1, mm0
        movd mm2, [ebx + eax]
        punpcklbw mm2, [esi + eax]
        movq mm3, mm2
        // (x+y)>>1 == (x&y)+((x^y)>>1)
        pand mm0, mm2
        pxor mm1, mm3
        psrlq mm1, 1
        pand mm1, mm7
        paddb mm0, mm1
        movq mm1, [ebp]
        movq mm2, mm1
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        movq [edi], mm1
        movq [edi+8], mm2
        add ebp, 8
        add ebx, 4
        add esi, 4
        add edi, 16
        dec ecx
        jnz yuvtoyuy2row_avg_loop
        pop ebx
        pop esi
        pop edi
        pop ebp
        ret
    };
}
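// The branch-free average used above keeps everything in 8 bits:
//   (x + y) >> 1  ==  (x & y) + ((x ^ y) >> 1)
// since x + y == 2*(x & y) + (x ^ y): the AND holds the bits both inputs share
// (worth double) and the XOR the bits only one of them has. A scalar model of
// the same per-byte trick (illustrative; the 0x7f mask mirrors pand mm1, mm7,
// which stops shifted bits from leaking in from the neighbouring byte):
static inline BYTE avg_floor_model(BYTE x, BYTE y)
{
    return (BYTE)((x & y) + (((x ^ y) >> 1) & 0x7f));
}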
static void __declspec(naked) yv12_yuy2_row_sse2() {
    __asm {
        // ebx - Y
        // edx - U
        // esi - V
        // edi - dest
        // ecx - halfwidth
        xor eax, eax
    one:
        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY
        movdqa xmm2, [edx + eax]        // UUUUUUUU
        movdqa xmm3, [esi + eax]        // VVVVVVVV
        movdqa xmm4, xmm2
        movdqa xmm5, xmm0
        movdqa xmm6, xmm1
        punpcklbw xmm2, xmm3 // VUVUVUVU
        punpckhbw xmm4, xmm3 // VUVUVUVU
        punpcklbw xmm0, xmm2 // VYUYVYUY
        punpcklbw xmm1, xmm4
        punpckhbw xmm5, xmm2
        punpckhbw xmm6, xmm4
        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6
        add eax, 16
        cmp eax, ecx
        jb one
        ret
    };
}

static void __declspec(naked) yv12_yuy2_row_sse2_linear() {
    __asm {
        // ebx - Y
        // edx - U
        // esi - V
        // edi - dest
        // ecx - width
        // ebp - uv_stride
        xor eax, eax
    one:
        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY
        movdqa xmm2, [edx]
        movdqa xmm3, [esi]
        pavgb xmm2, [edx + ebp] // UUUUUUUU
        pavgb xmm3, [esi + ebp] // VVVVVVVV
        movdqa xmm4, xmm2
        movdqa xmm5, xmm0
        movdqa xmm6, xmm1
        punpcklbw xmm2, xmm3 // VUVUVUVU
        punpckhbw xmm4, xmm3 // VUVUVUVU
        punpcklbw xmm0, xmm2 // VYUYVYUY
        punpcklbw xmm1, xmm4
        punpckhbw xmm5, xmm2
        punpckhbw xmm6, xmm4
        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6
        add eax, 16
        add edx, 16
        add esi, 16
        cmp eax, ecx
        jb one
        ret
    };
}

static void __declspec(naked) yv12_yuy2_row_sse2_linear_interlaced() {
    __asm {
        // ebx - Y
        // edx - U
        // esi - V
        // edi - dest
        // ecx - width
        // ebp - uv_stride
        xor eax, eax
    one:
        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY
        movdqa xmm2, [edx]
        movdqa xmm3, [esi]
        pavgb xmm2, [edx + ebp*2] // UUUUUUUU
        pavgb xmm3, [esi + ebp*2] // VVVVVVVV
        movdqa xmm4, xmm2
        movdqa xmm5, xmm0
        movdqa xmm6, xmm1
        punpcklbw xmm2, xmm3 // VUVUVUVU
        punpckhbw xmm4, xmm3 // VUVUVUVU
        punpcklbw xmm0, xmm2 // VYUYVYUY
        punpcklbw xmm1, xmm4
        punpckhbw xmm5, xmm2
        punpckhbw xmm6, xmm4
        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6
        add eax, 16
        add edx, 16
        add esi, 16
        cmp eax, ecx
        jb one
        ret
    };
}
void __declspec(naked) yv12_yuy2_sse2(const BYTE *Y, const BYTE *U, const BYTE *V,
                                      int halfstride, unsigned halfwidth, unsigned height,
                                      BYTE *YUY2, int d_stride)
{
    __asm {
        push ebx
        push esi
        push edi
        push ebp
        mov ebx, [esp + 20] // Y
        mov edx, [esp + 24] // U
        mov esi, [esp + 28] // V
        mov edi, [esp + 44] // D
        mov ebp, [esp + 32] // uv_stride
        mov ecx, [esp + 36] // uv_width
        mov eax, ecx
        add eax, 15
        and eax, 0xfffffff0
        sub [esp + 32], eax
        cmp dword ptr [esp + 40], 2
        jbe last2
    row:
        sub dword ptr [esp + 40], 2
        call yv12_yuy2_row_sse2
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        call yv12_yuy2_row_sse2_linear
        add edx, [esp + 32]
        add esi, [esp + 32]
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        cmp dword ptr [esp + 40], 2
        ja row
    last2:
        call yv12_yuy2_row_sse2
        dec dword ptr [esp + 40]
        jz done
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        call yv12_yuy2_row_sse2
    done:
        pop ebp
        pop edi
        pop esi
        pop ebx
        ret
    };
}
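// Note on the driver above (descriptive): yv12_yuy2_row_sse2_linear advances
// edx/esi by 16 bytes per iteration, i.e. by uv_width rounded up to 16, so the
// prologue rewrites [esp + 32] to hold uv_stride - round_up(uv_width, 16);
// adding that back after each row pair leaves the U/V pointers advanced by
// exactly one chroma line per two output rows.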
void __declspec(naked) yv12_yuy2_sse2_interlaced(const BYTE *Y, const BYTE *U, const BYTE *V,
                                                 int halfstride, unsigned halfwidth, unsigned height,
                                                 BYTE *YUY2, int d_stride)
{
    __asm {
        push ebx
        push esi
        push edi
        push ebp
        mov ebx, [esp + 20] // Y
        mov edx, [esp + 24] // U
        mov esi, [esp + 28] // V
        mov edi, [esp + 44] // D
        mov ebp, [esp + 32] // uv_stride
        mov ecx, [esp + 36] // uv_width
        mov eax, ecx
        add eax, 15
        and eax, 0xfffffff0
        sub [esp + 32], eax
        cmp dword ptr [esp + 40], 4
        jbe last4
    row:
        sub dword ptr [esp + 40], 4
        call yv12_yuy2_row_sse2 // first row, first field
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        add edx, ebp
        add esi, ebp
        call yv12_yuy2_row_sse2 // first row, second field
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        sub edx, ebp
        sub esi, ebp
        call yv12_yuy2_row_sse2_linear_interlaced // second row, first field
        add edx, [esp + 32]
        add esi, [esp + 32]
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        call yv12_yuy2_row_sse2_linear_interlaced // second row, second field
        add edx, [esp + 32]
        add esi, [esp + 32]
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        cmp dword ptr [esp + 40], 4
        ja row
    last4:
        call yv12_yuy2_row_sse2
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        add edx, ebp
        add esi, ebp
        call yv12_yuy2_row_sse2
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        sub edx, ebp
        sub esi, ebp
        call yv12_yuy2_row_sse2
        lea ebx, [ebx + ebp*2]
        add edi, [esp + 48]
        add edx, ebp
        add esi, ebp
        call yv12_yuy2_row_sse2
        pop ebp
        pop edi
        pop esi
        pop ebx
        ret
    };
}
bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced)
{
    if(h&1) h--;
    if(w <= 0 || h <= 0 || (w&1) || (h&1))
        return(false);
    if(srcpitch == 0) srcpitch = w;
    if((g_cpuid.m_flags & CCpuID::sse2)
    && !((DWORD_PTR)srcy&15) && !((DWORD_PTR)srcu&15) && !((DWORD_PTR)srcv&15) && !(srcpitch&31)
    && !((DWORD_PTR)dst&15) && !(dstpitch&15))
    {
        if(!fInterlaced)
            yv12_yuy2_sse2(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);
        else
            yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);
        return true;
    }
    else
    {
        ASSERT(!fInterlaced);
    }
    void (*yuvtoyuy2row)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) = NULL;
    void (*yuvtoyuy2row_avg)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) = NULL;
    if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
    {
        yuvtoyuy2row = yuvtoyuy2row_MMX;
        yuvtoyuy2row_avg = yuvtoyuy2row_avg_MMX;
    }
    else
    {
        yuvtoyuy2row = yuvtoyuy2row_c;
        yuvtoyuy2row_avg = yuvtoyuy2row_avg_c;
    }
    if(!yuvtoyuy2row)
        return(false);
    do
    {
        yuvtoyuy2row(dst, srcy, srcu, srcv, w);
        yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2);
        dst += 2*dstpitch;
        srcy += srcpitch*2;
        srcu += srcpitch/2;
        srcv += srcpitch/2;
    }
    while((h -= 2) > 2);
    // the last two rows have no chroma row below them to average with
    yuvtoyuy2row(dst, srcy, srcu, srcv, w);
    yuvtoyuy2row(dst + dstpitch, srcy + srcpitch, srcu, srcv, w);
    if(g_cpuid.m_flags & CCpuID::mmx)
        __asm emms
    return(true);
}

bool BitBltFromRGBToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch, int sbpp)
{
    // TODO: not implemented; reports success without converting anything
    return true;
}
bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp, DWORD* palette)
{
    if(dbpp == sbpp)
    {
        int rowbytes = w*dbpp>>3;
        int y = 0;
        __try
        {
            if(h&1) h--;
            if(rowbytes > 0 && rowbytes == srcpitch && rowbytes == dstpitch)
            {
                memcpy_accel(dst, src, h*rowbytes);
            }
            else
            {
                for(y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
                    memcpy_accel(dst, src, rowbytes);
            }
        }
        __except(EXCEPTION_EXECUTE_HANDLER)
        {
            SVP_LogMsg5(L" rgb to rgb %d %d %d %d %d %d %d %d", w, h, dstpitch, srcpitch, dbpp, sbpp, rowbytes, y);
        }
        return(true);
    }
    if((sbpp != 16 && sbpp != 24 && sbpp != 32 && sbpp != 15 && sbpp != 8)
    || (dbpp != 16 && dbpp != 24 && dbpp != 32))
        return(false);
    if(dbpp == 16)
    {
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        {
            if(sbpp == 8)
            {
                BYTE* s = (BYTE*)src;
                WORD* d = (WORD*)dst;
                if(palette)
                {
                    for(int x = 0; x < w; x++, s++, d++)
                    {
                        DWORD c = palette[*s]; // palette entries are 0x00RRGGBB
                        *d = (WORD)(((c>>8)&0xf800)|((c>>5)&0x07e0)|((c>>3)&0x1f));
                    }
                }
                else
                {
                    for(int x = 0; x < w; x++, s++, d++) // R3G3B2 -> R5G6B5
                        *d = (WORD)(((*s&0xe0)<<8)|((*s&0x1c)<<6)|((*s&0x03)<<3));
                }
            }
            else if(sbpp == 15)
            {
                WORD* s = (WORD*)src;
                WORD* d = (WORD*)dst;
                for(int x = 0; x < w; x++, s++, d++) // X1R5G5B5 -> R5G6B5
                    *d = (WORD)(((*s&0x7fe0)<<1)|(*s&0x001f));
            }
            else if(sbpp == 24)
            {
                BYTE* s = (BYTE*)src;
                WORD* d = (WORD*)dst;
                for(int x = 0; x < w; x++, s += 3, d++)
                    *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f));
            }
            else if(sbpp == 32)
            {
                DWORD* s = (DWORD*)src;
                WORD* d = (WORD*)dst;
                for(int x = 0; x < w; x++, s++, d++)
                    *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f));
            }
        }
    }
    else if(dbpp == 24)
    {
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        {
            if(sbpp == 8)
            {
                BYTE* s = (BYTE*)src;
                BYTE* d = (BYTE*)dst;
                if(palette)
                {
                    for(int x = 0; x < w; x++, s++, d += 3)
                    {
                        d[0] = (palette[*s]&0xff0000) >> 16;
                        d[1] = (palette[*s]&0xff00) >> 8;
                        d[2] = palette[*s]&0xff;
                    }
                }
                else
                {
                    for(int x = 0; x < w; x++, s++, d += 3)
                    { // not tested, r-g-b might be in reverse
                        d[0] = (*s&0x03)<<6;
                        d[1] = (*s&0x1c)<<3;
                        d[2] = (*s&0xe0);
                    }
                }
            }
            else if(sbpp == 15)
            {
                WORD* s = (WORD*)src;
                BYTE* d = (BYTE*)dst;
                for(int x = 0; x < w; x++, s++, d += 3)
                { // not tested, r-g-b might be in reverse
                    d[0] = (*s&0x001f)<<3;
                    d[1] = (*s&0x03e0)>>2;
                    d[2] = (*s&0x7c00)>>7;
                }
            }
            else if(sbpp == 16)
            {
                WORD* s = (WORD*)src;
                BYTE* d = (BYTE*)dst;
                for(int x = 0; x < w; x++, s++, d += 3)
                { // not tested, r-g-b might be in reverse
                    d[0] = (*s&0x001f)<<3;
                    d[1] = (*s&0x07e0)>>3;
                    d[2] = (*s&0xf800)>>8;
                }
            }
            else if(sbpp == 32)
            {
                BYTE* s = (BYTE*)src;
                BYTE* d = (BYTE*)dst;
                for(int x = 0; x < w; x++, s += 4, d += 3)
                    {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];}
            }
        }
    }
    else if(dbpp == 32)
    {
        for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        {
            if(sbpp == 8)
            {
                BYTE* s = (BYTE*)src;
                DWORD* d = (DWORD*)dst;
                if(palette)
                {
                    for(int x = 0; x < w; x++, s++, d++)
                        *d = palette[*s];
                }
                else
                {
                    for(int x = 0; x < w; x++, s++, d++)
                        *d = ((*s&0xe0)<<16)|((*s&0x1c)<<11)|((*s&0x03)<<6);
                }
            }
            else if(sbpp == 15)
            {
                WORD* s = (WORD*)src;
                DWORD* d = (DWORD*)dst;
                for(int x = 0; x < w; x++, s++, d++)
                    *d = ((*s&0x7c00)<<9)|((*s&0x03e0)<<6)|((*s&0x001f)<<3);
            }
            else if(sbpp == 16)
            {
                WORD* s = (WORD*)src;
                DWORD* d = (DWORD*)dst;
                for(int x = 0; x < w; x++, s++, d++)
                    *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3);
            }
            else if(sbpp == 24)
            {
                BYTE* s = (BYTE*)src;
                DWORD* d = (DWORD*)dst;
                for(int x = 0; x < w; x++, s += 3, d++)
                    *d = *((DWORD*)s)&0xffffff;
            }
        }
    }
    return(true);
}
static void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    static const __int64 _x0001000100010001 = 0x0001000100010001i64;
    __asm {
        push ebp
        push edi
        push esi
        push ebx
        mov edi, [esp+20] // dst
        mov esi, [esp+24] // src
        sub edi, esi
        mov ebp, [esp+28] // w
        mov edx, [esp+32] // srcpitch
        shr ebp, 3
        movq mm6, _x0001000100010001
        pxor mm7, mm7
    xloop:
        movq mm0, [esi]
        movq mm3, mm0
        punpcklbw mm0, mm7
        punpckhbw mm3, mm7
        movq mm1, [esi+edx]
        movq mm4, mm1
        punpcklbw mm1, mm7
        punpckhbw mm4, mm7
        paddw mm1, mm0
        paddw mm1, mm6
        psrlw mm1, 1
        paddw mm4, mm3
        paddw mm4, mm6
        psrlw mm4, 1
        add esi, 8
        packuswb mm1, mm4
        movq [edi+esi-8], mm1
        dec ebp
        jne xloop
        pop ebx
        pop esi
        pop edi
        pop ebp
        ret
    };
}
static void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64;
    static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64;
    static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64;
    static const __int64 _x0002000200020002 = 0x0002000200020002i64;
    __asm {
        push ebp
        push edi
        push esi
        push ebx
        mov edi, [esp+20]
        mov esi, [esp+24]
        sub edi, esi
        mov ebp, [esp+28]
        mov edx, [esp+32]
        shr ebp, 3
        movq mm6, _x0002000200020002
        pxor mm7, mm7
    xloop:
        movq mm0, [esi]
        movq mm3, mm0
        punpcklbw mm0, mm7
        punpckhbw mm3, mm7
        movq mm1, [esi+edx]
        movq mm4, mm1
        punpcklbw mm1, mm7
        punpckhbw mm4, mm7
        movq mm2, [esi+edx*2]
        movq mm5, mm2
        punpcklbw mm2, mm7
        punpckhbw mm5, mm7
        psllw mm1, 1
        paddw mm1, mm0
        paddw mm1, mm2
        paddw mm1, mm6
        psrlw mm1, 2
        psllw mm4, 1
        paddw mm4, mm3
        paddw mm4, mm5
        paddw mm4, mm6
        psrlw mm4, 2
        add esi, 8
        packuswb mm1, mm4
        movq [edi+esi-8], mm1
        dec ebp
        jne xloop
        // sadly the original code makes a lot of visible banding artifacts on yuv
        // (it seems those shiftings without rounding introduce too much error)
        /*
        mov edi, [esp+20]
        mov esi, [esp+24]
        sub edi, esi
        mov ebp, [esp+28]
        mov edx, [esp+32]
        movq mm5, mask0
        movq mm6, mask1
        movq mm7, mask2
        shr ebp, 1
        jz oddpart
    xloop:
        movq mm2, [esi]
        movq mm0, mm5
        movq mm1, [esi+edx]
        pand mm0, mm2
        psrlq mm1, 1
        movq mm2, [esi+edx*2]
        psrlq mm2, 2
        pand mm1, mm6
        psrlq mm0, 2
        pand mm2, mm7
        paddb mm0, mm1
        add esi, 8
        paddb mm0, mm2
        dec ebp
        movq [edi+esi-8], mm0
        jne xloop
    oddpart:
        test byte ptr [esp+28], 1
        jz nooddpart
        mov ecx, [esi]
        mov eax, 0fcfcfcfch
        mov ebx, [esi+edx]
        and eax, ecx
        shr ebx, 1
        mov ecx, [esi+edx*2]
        shr ecx, 2
        and ebx, 07f7f7f7fh
        shr eax, 2
        and ecx, 03f3f3f3fh
        add eax, ebx
        add eax, ecx
        mov [edi+esi], eax
    nooddpart:
        */
        pop ebx
        pop esi
        pop edi
        pop ebp
        ret
    };
}
__declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};

static void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    __asm
    {
        mov edx, srcpitch
        mov esi, src
        mov edi, dst
        sub edi, esi
        mov ecx, w
        mov ebx, ecx
        shr ecx, 4
        and ebx, 15
        movdqa xmm7, [const_1_16_bytes]
    asm_blend_row_SSE2_loop:
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+edx]
        movdqa xmm2, [esi+edx*2]
        pavgb xmm0, xmm1
        pavgb xmm2, xmm1
        psubusb xmm0, xmm7
        pavgb xmm0, xmm2
        movdqa [esi+edi], xmm0
        add esi, 16
        dec ecx
        jnz asm_blend_row_SSE2_loop
        test ebx, 15
        jz asm_blend_row_SSE2_end
        mov ecx, ebx
        xor ax, ax
        xor bx, bx
        xor dx, dx
    asm_blend_row_SSE2_loop2:
        mov al, [esi]
        mov bl, [esi+edx]
        mov dl, [esi+edx*2]
        add ax, bx
        inc ax
        shr ax, 1
        add dx, bx
        inc dx
        shr dx, 1
        add ax, dx
        shr ax, 1
        mov [esi+edi], al
        inc esi
        dec ecx
        jnz asm_blend_row_SSE2_loop2
    asm_blend_row_SSE2_end:
    }
}
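// Scalar model of the SSE2 blend above (illustrative; blend121_model is not
// part of the original file): pavgb computes (x + y + 1) >> 1, so the sequence
// avg(avg(a,b) - 1, avg(c,b)) approximates the exact (a + 2*b + c + 2) >> 2 of
// asm_blend_row_c; the psubusb by 1 compensates for the nested averages both
// rounding up.
static inline BYTE blend121_model(BYTE a, BYTE b, BYTE c)
{
    int t0 = (a + b + 1) >> 1;         // pavgb xmm0, xmm1
    int t1 = (c + b + 1) >> 1;         // pavgb xmm2, xmm1
    if(t0 > 0) t0--;                   // psubusb xmm0, xmm7 (saturating)
    return (BYTE)((t0 + t1 + 1) >> 1); // pavgb xmm0, xmm2
}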
static void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
    __asm
    {
        mov edx, srcpitch
        mov esi, src
        mov edi, dst
        sub edi, esi
        mov ecx, w
        mov ebx, ecx
        shr ecx, 4
        and ebx, 15
        movdqa xmm7, [const_1_16_bytes]
    asm_blend_row_clipped_SSE2_loop:
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+edx]
        pavgb xmm0, xmm1
        movdqa [esi+edi], xmm0
        add esi, 16
        dec ecx
        jnz asm_blend_row_clipped_SSE2_loop
        test ebx, 15
        jz asm_blend_row_clipped_SSE2_end
        mov ecx, ebx
        xor ax, ax
        xor bx, bx
    asm_blend_row_clipped_SSE2_loop2:
        mov al, [esi]
        mov bl, [esi+edx]
        add ax, bx
        inc ax
        shr ax, 1
        mov [esi+edi], al
        inc esi
        dec ecx
        jnz asm_blend_row_clipped_SSE2_loop2
    asm_blend_row_clipped_SSE2_end:
    }
}
void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch)
{
    void (*asm_blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
    void (*asm_blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
    if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf))
    {
        asm_blend_row_clipped = asm_blend_row_clipped_SSE2;
        asm_blend_row = asm_blend_row_SSE2;
    }
    else if(g_cpuid.m_flags & CCpuID::mmx)
    {
        asm_blend_row_clipped = asm_blend_row_clipped_MMX;
        asm_blend_row = asm_blend_row_MMX;
    }
    else
    {
        asm_blend_row_clipped = asm_blend_row_clipped_c;
        asm_blend_row = asm_blend_row_c;
    }
    if(!asm_blend_row_clipped)
        return;
    asm_blend_row_clipped(dst, src, rowbytes, srcpitch);
    if((h -= 2) > 0) do
    {
        dst += dstpitch;
        asm_blend_row(dst, src, rowbytes, srcpitch);
        src += srcpitch;
    }
    while(--h);
    asm_blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch);
    if(g_cpuid.m_flags & CCpuID::mmx)
        __asm emms
}
void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield)
{
    if(topfield)
    {
        BitBltFromRGBToRGB(rowbytes, h/2, dst, dstpitch*2, 8, src, srcpitch*2, 8);
        AvgLines8(dst, h, dstpitch);
    }
    else
    {
        BitBltFromRGBToRGB(rowbytes, h/2, dst + dstpitch, dstpitch*2, 8, src + srcpitch, srcpitch*2, 8);
        AvgLines8(dst + dstpitch, h-1, dstpitch);
    }
}
void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)
{
    if(h <= 1) return;
    BYTE* s = dst;
    BYTE* d = dst + (h-2)*pitch;
    for(; s < d; s += pitch*2)
    {
        BYTE* tmp = s;
        if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf))
        {
            __asm
            {
                mov esi, tmp
                mov ebx, pitch
                mov ecx, ebx
                shr ecx, 4
            AvgLines8_sse2_loop:
                movdqa xmm0, [esi]
                pavgb xmm0, [esi+ebx*2]
                movdqa [esi+ebx], xmm0
                add esi, 16
                dec ecx
                jnz AvgLines8_sse2_loop
                mov tmp, esi
            }
            for(int i = pitch&7; i--; tmp++)
            {
                tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
            }
        }
        else if(g_cpuid.m_flags & CCpuID::mmx)
        {
            __asm
            {
                mov esi, tmp
                mov ebx, pitch
                mov ecx, ebx
                shr ecx, 3
                pxor mm7, mm7
            AvgLines8_mmx_loop:
                movq mm0, [esi]
                movq mm1, mm0
                punpcklbw mm0, mm7
                punpckhbw mm1, mm7
                movq mm2, [esi+ebx*2]
                movq mm3, mm2
                punpcklbw mm2, mm7
                punpckhbw mm3, mm7
                paddw mm0, mm2
                psrlw mm0, 1
                paddw mm1, mm3
                psrlw mm1, 1
                packuswb mm0, mm1
                movq [esi+ebx], mm0
                lea esi, [esi+8]
                dec ecx
                jnz AvgLines8_mmx_loop
                mov tmp, esi
            }
            for(int i = pitch&7; i--; tmp++)
            {
                tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
            }
        }
        else
        {
            for(int i = pitch; i--; tmp++)
            {
                tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
            }
        }
    }
    if(!(h&1) && h >= 2)
    {
        dst += (h-2)*pitch;
        memcpy_accel(dst + pitch, dst, pitch);
    }
    __asm emms;
}
void AvgLines555(BYTE* dst, DWORD h, DWORD pitch)
{
    if(h <= 1) return;
    unsigned __int64 __0x7c007c007c007c00 = 0x7c007c007c007c00;
    unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0;
    unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
    BYTE* s = dst;
    BYTE* d = dst + (h-2)*pitch;
    for(; s < d; s += pitch*2)
    {
        WORD* tmp = (WORD*)s; // pitch is in bytes, tmp walks WORD pixels
        __asm
        {
            mov esi, tmp
            mov ebx, pitch
            mov ecx, ebx
            shr ecx, 3
            movq mm6, __0x03e003e003e003e0
            movq mm7, __0x001f001f001f001f
        AvgLines555_loop:
            movq mm0, [esi]
            movq mm1, mm0
            movq mm2, mm0
            psrlw mm0, 10 // red1 bits: mm0 = 001f001f001f001f
            pand mm1, mm6 // green1 bits: mm1 = 03e003e003e003e0
            pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f
            movq mm3, [esi+ebx*2]
            movq mm4, mm3
            movq mm5, mm3
            psrlw mm3, 10 // red2 bits: mm3 = 001f001f001f001f
            pand mm4, mm6 // green2 bits: mm4 = 03e003e003e003e0
            pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f
            paddw mm0, mm3
            psrlw mm0, 1 // (red1+red2)/2
            psllw mm0, 10 // red bits at 7c007c007c007c00
            paddw mm1, mm4
            psrlw mm1, 1 // (green1+green2)/2
            pand mm1, mm6 // green bits at 03e003e003e003e0
            paddw mm2, mm5
            psrlw mm2, 1 // (blue1+blue2)/2
            // blue bits at 001f001f001f001f (no need to pand, lower bits were discarded)
            por mm0, mm1
            por mm0, mm2
            movq [esi+ebx], mm0
            lea esi, [esi+8]
            dec ecx
            jnz AvgLines555_loop
            mov tmp, esi
        }
        for(int i = (pitch&7)>>1; i--; tmp++)
        {
            tmp[pitch>>1] =
                ((((*tmp&0x7c00) + (tmp[pitch]&0x7c00)) >> 1)&0x7c00)|
                ((((*tmp&0x03e0) + (tmp[pitch]&0x03e0)) >> 1)&0x03e0)|
                ((((*tmp&0x001f) + (tmp[pitch]&0x001f)) >> 1)&0x001f);
        }
    }
    if(!(h&1) && h >= 2)
    {
        dst += (h-2)*pitch;
        memcpy_accel(dst + pitch, dst, pitch);
    }
    __asm emms;
}
void AvgLines565(BYTE* dst, DWORD h, DWORD pitch)
{
    if(h <= 1) return;
    unsigned __int64 __0xf800f800f800f800 = 0xf800f800f800f800;
    unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0;
    unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
    BYTE* s = dst;
    BYTE* d = dst + (h-2)*pitch;
    for(; s < d; s += pitch*2)
    {
        WORD* tmp = (WORD*)s; // pitch is in bytes, tmp walks WORD pixels
        __asm
        {
            mov esi, tmp
            mov ebx, pitch
            mov ecx, ebx
            shr ecx, 3
            movq mm6, __0x07e007e007e007e0
            movq mm7, __0x001f001f001f001f
        AvgLines565_loop:
            movq mm0, [esi]
            movq mm1, mm0
            movq mm2, mm0
            psrlw mm0, 11 // red1 bits: mm0 = 001f001f001f001f
            pand mm1, mm6 // green1 bits: mm1 = 07e007e007e007e0
            pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f
            movq mm3, [esi+ebx*2]
            movq mm4, mm3
            movq mm5, mm3
            psrlw mm3, 11 // red2 bits: mm3 = 001f001f001f001f
            pand mm4, mm6 // green2 bits: mm4 = 07e007e007e007e0
            pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f
            paddw mm0, mm3
            psrlw mm0, 1 // (red1+red2)/2
            psllw mm0, 11 // red bits at f800f800f800f800
            paddw mm1, mm4
            psrlw mm1, 1 // (green1+green2)/2
            pand mm1, mm6 // green bits at 07e007e007e007e0
            paddw mm2, mm5
            psrlw mm2, 1 // (blue1+blue2)/2
            // blue bits at 001f001f001f001f (no need to pand, lower bits were discarded)
            por mm0, mm1
            por mm0, mm2
            movq [esi+ebx], mm0
            lea esi, [esi+8]
            dec ecx
            jnz AvgLines565_loop
            mov tmp, esi
        }
        for(int i = (pitch&7)>>1; i--; tmp++)
        {
            tmp[pitch>>1] =
                ((((*tmp&0xf800) + (tmp[pitch]&0xf800)) >> 1)&0xf800)|
                ((((*tmp&0x07e0) + (tmp[pitch]&0x07e0)) >> 1)&0x07e0)|
                ((((*tmp&0x001f) + (tmp[pitch]&0x001f)) >> 1)&0x001f);
        }
    }
    if(!(h&1) && h >= 2)
    {
        dst += (h-2)*pitch;
        memcpy_accel(dst + pitch, dst, pitch);
    }
    __asm emms;
}
  1561. extern "C" void mmx_YUY2toRGB24(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
  1562. extern "C" void mmx_YUY2toRGB32(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
  1563. bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch)
  1564. {
  1565. void (* YUY2toRGB)(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709) = NULL;
  1566. if(g_cpuid.m_flags & CCpuID::mmx)
  1567. {
  1568. YUY2toRGB =
  1569. dbpp == 32 ? mmx_YUY2toRGB32 :
  1570. dbpp == 24 ? mmx_YUY2toRGB24 :
  1571. // dbpp == 16 ? mmx_YUY2toRGB16 : // TODO
  1572. NULL;
  1573. }
  1574. else
  1575. {
  1576. // TODO
  1577. }
  1578. if(!YUY2toRGB)
  1579. {
  1580. if(dbpp == 16){
  1581. BYTE* tmp = (BYTE*) malloc( w*h*5) ;
  1582. if(tmp){
  1583. mmx_YUY2toRGB32(src, tmp, src + h*srcpitch, srcpitch, w, false);
  1584. BitBltFromRGBToRGB(w, h, dst, w*2, 16, tmp, w*4, 32 );
  1585. free(tmp);
  1586. }
  1587. return(true);
  1588. }else{
  1589. return(false);
  1590. }
  1591. }
  1592. YUY2toRGB(src, dst, src + h*srcpitch, srcpitch, w, false);
  1593. return(true);
  1594. }
  1595. #endif //_WIN64