PageRenderTime 72ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/Avc/pred_inter.cpp

http://github.com/mbebenita/Broadway
C++ | 2330 lines | 1974 code | 201 blank | 155 comment | 158 complexity | 5750c7959b2cde35dcd102bfcf79ae88 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /* ------------------------------------------------------------------
  2. * Copyright (C) 1998-2009 PacketVideo
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  13. * express or implied.
  14. * See the License for the specific language governing permissions
  15. * and limitations under the License.
  16. * -------------------------------------------------------------------
  17. */
#include "avcdec_lib.h"
#include <stdint.h>
  19. #define CLIP_RESULT(x) if((uint)x > 0xFF){ \
  20. x = 0xFF & (~(x>>31));}
  21. /* (blkwidth << 2) + (dy << 1) + dx */
  22. static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
  23. {
  24. &ChromaFullMC_SIMD,
  25. &ChromaHorizontalMC_SIMD,
  26. &ChromaVerticalMC_SIMD,
  27. &ChromaDiagonalMC_SIMD,
  28. &ChromaFullMC_SIMD,
  29. &ChromaHorizontalMC2_SIMD,
  30. &ChromaVerticalMC2_SIMD,
  31. &ChromaDiagonalMC2_SIMD
  32. };
  33. /* Perform motion prediction and compensation with residue if exist. */
  34. void InterMBPrediction(AVCCommonObj *video)
  35. {
  36. AVCMacroblock *currMB = video->currMB;
  37. AVCPictureData *currPic = video->currPic;
  38. int mbPartIdx, subMbPartIdx;
  39. int ref_idx;
  40. int offset_MbPart_indx = 0;
  41. int16 *mv;
  42. uint32 x_pos, y_pos;
  43. uint8 *curL, *curCb, *curCr;
  44. uint8 *ref_l, *ref_Cb, *ref_Cr;
  45. uint8 *predBlock, *predCb, *predCr;
  46. int block_x, block_y, offset_x, offset_y, offsetP, offset;
  47. int x_position = (video->mb_x << 4);
  48. int y_position = (video->mb_y << 4);
  49. int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
  50. int picWidth = currPic->pitch;
  51. int picHeight = currPic->height;
  52. int16 *dataBlock;
  53. uint32 cbp4x4;
  54. uint32 tmp_word;
  55. tmp_word = y_position * picWidth;
  56. curL = currPic->Sl + tmp_word + x_position;
  57. offset = (tmp_word >> 2) + (x_position >> 1);
  58. curCb = currPic->Scb + offset;
  59. curCr = currPic->Scr + offset;
  60. #ifdef USE_PRED_BLOCK
  61. predBlock = video->pred + 84;
  62. predCb = video->pred + 452;
  63. predCr = video->pred + 596;
  64. #else
  65. predBlock = curL;
  66. predCb = curCb;
  67. predCr = curCr;
  68. #endif
  69. GetMotionVectorPredictor(video, false);
  70. for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
  71. {
  72. MbHeight = currMB->SubMbPartHeight[mbPartIdx];
  73. MbWidth = currMB->SubMbPartWidth[mbPartIdx];
  74. mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
  75. mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
  76. ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
  77. offset_indx = 0;
  78. ref_l = video->RefPicList0[ref_idx]->Sl;
  79. ref_Cb = video->RefPicList0[ref_idx]->Scb;
  80. ref_Cr = video->RefPicList0[ref_idx]->Scr;
  81. for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
  82. {
  83. block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); // check this
  84. block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
  85. mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
  86. offset_x = x_position + (block_x << 2);
  87. offset_y = y_position + (block_y << 2);
  88. x_pos = (offset_x << 2) + *mv++; /*quarter pel */
  89. y_pos = (offset_y << 2) + *mv; /*quarter pel */
  90. //offset = offset_y * currPic->width;
  91. //offsetC = (offset >> 2) + (offset_x >> 1);
  92. #ifdef USE_PRED_BLOCK
  93. offsetP = (block_y * 80) + (block_x << 2);
  94. LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
  95. /*comp_Sl + offset + offset_x,*/
  96. predBlock + offsetP, 20, MbWidth, MbHeight);
  97. #else
  98. offsetP = (block_y << 2) * picWidth + (block_x << 2);
  99. LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
  100. /*comp_Sl + offset + offset_x,*/
  101. predBlock + offsetP, picWidth, MbWidth, MbHeight);
  102. #endif
  103. #ifdef USE_PRED_BLOCK
  104. offsetP = (block_y * 24) + (block_x << 1);
  105. ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  106. /*comp_Scb + offsetC,*/
  107. predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
  108. ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  109. /*comp_Scr + offsetC,*/
  110. predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
  111. #else
  112. offsetP = (block_y * picWidth) + (block_x << 1);
  113. ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  114. /*comp_Scb + offsetC,*/
  115. predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
  116. ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  117. /*comp_Scr + offsetC,*/
  118. predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
  119. #endif
  120. offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
  121. }
  122. offset_MbPart_indx = currMB->MbPartWidth >> 4;
  123. }
  124. /* used in decoder, used to be if(!encFlag) */
  125. /* transform in raster scan order */
  126. dataBlock = video->block;
  127. cbp4x4 = video->cbp4x4;
  128. /* luma */
  129. for (block_y = 4; block_y > 0; block_y--)
  130. {
  131. for (block_x = 4; block_x > 0; block_x--)
  132. {
  133. #ifdef USE_PRED_BLOCK
  134. if (cbp4x4&1)
  135. {
  136. itrans(dataBlock, predBlock, predBlock, 20);
  137. }
  138. #else
  139. if (cbp4x4&1)
  140. {
  141. itrans(dataBlock, curL, curL, picWidth);
  142. }
  143. #endif
  144. cbp4x4 >>= 1;
  145. dataBlock += 4;
  146. #ifdef USE_PRED_BLOCK
  147. predBlock += 4;
  148. #else
  149. curL += 4;
  150. #endif
  151. }
  152. dataBlock += 48;
  153. #ifdef USE_PRED_BLOCK
  154. predBlock += 64;
  155. #else
  156. curL += ((picWidth << 2) - 16);
  157. #endif
  158. }
  159. /* chroma */
  160. picWidth = (picWidth >> 1);
  161. for (block_y = 2; block_y > 0; block_y--)
  162. {
  163. for (block_x = 2; block_x > 0; block_x--)
  164. {
  165. #ifdef USE_PRED_BLOCK
  166. if (cbp4x4&1)
  167. {
  168. ictrans(dataBlock, predCb, predCb, 12);
  169. }
  170. #else
  171. if (cbp4x4&1)
  172. {
  173. ictrans(dataBlock, curCb, curCb, picWidth);
  174. }
  175. #endif
  176. cbp4x4 >>= 1;
  177. dataBlock += 4;
  178. #ifdef USE_PRED_BLOCK
  179. predCb += 4;
  180. #else
  181. curCb += 4;
  182. #endif
  183. }
  184. for (block_x = 2; block_x > 0; block_x--)
  185. {
  186. #ifdef USE_PRED_BLOCK
  187. if (cbp4x4&1)
  188. {
  189. ictrans(dataBlock, predCr, predCr, 12);
  190. }
  191. #else
  192. if (cbp4x4&1)
  193. {
  194. ictrans(dataBlock, curCr, curCr, picWidth);
  195. }
  196. #endif
  197. cbp4x4 >>= 1;
  198. dataBlock += 4;
  199. #ifdef USE_PRED_BLOCK
  200. predCr += 4;
  201. #else
  202. curCr += 4;
  203. #endif
  204. }
  205. dataBlock += 48;
  206. #ifdef USE_PRED_BLOCK
  207. predCb += 40;
  208. predCr += 40;
  209. #else
  210. curCb += ((picWidth << 2) - 8);
  211. curCr += ((picWidth << 2) - 8);
  212. #endif
  213. }
  214. #ifdef MB_BASED_DEBLOCK
  215. SaveNeighborForIntraPred(video, offset);
  216. #endif
  217. return ;
  218. }
  219. /* preform the actual motion comp here */
  220. void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
  221. int x_pos, int y_pos,
  222. uint8 *pred, int pred_pitch,
  223. int blkwidth, int blkheight)
  224. {
  225. int dx, dy;
  226. uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
  227. int temp2[21][21]; /* for intermediate results */
  228. uint8 *ref2;
  229. dx = x_pos & 3;
  230. dy = y_pos & 3;
  231. x_pos = x_pos >> 2; /* round it to full-pel resolution */
  232. y_pos = y_pos >> 2;
  233. /* perform actual motion compensation */
  234. if (dx == 0 && dy == 0)
  235. { /* fullpel position *//* G */
  236. if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
  237. {
  238. ref += y_pos * picwidth + x_pos;
  239. FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
  240. }
  241. else
  242. {
  243. CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
  244. FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
  245. }
  246. } /* other positions */
  247. else if (dy == 0)
  248. { /* no vertical interpolation *//* a,b,c*/
  249. if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
  250. {
  251. ref += y_pos * picwidth + x_pos;
  252. HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
  253. }
  254. else /* need padding */
  255. {
  256. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
  257. HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
  258. }
  259. }
  260. else if (dx == 0)
  261. { /*no horizontal interpolation *//* d,h,n */
  262. if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
  263. {
  264. ref += y_pos * picwidth + x_pos;
  265. VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
  266. }
  267. else /* need padding */
  268. {
  269. CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
  270. VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
  271. }
  272. }
  273. else if (dy == 2)
  274. { /* horizontal cross *//* i, j, k */
  275. if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
  276. {
  277. ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
  278. VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
  279. HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
  280. }
  281. else /* need padding */
  282. {
  283. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
  284. VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
  285. HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
  286. }
  287. }
  288. else if (dx == 2)
  289. { /* vertical cross */ /* f,q */
  290. if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
  291. {
  292. ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
  293. HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
  294. VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
  295. }
  296. else /* need padding */
  297. {
  298. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
  299. HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
  300. VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
  301. }
  302. }
  303. else
  304. { /* diagonal *//* e,g,p,r */
  305. if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
  306. y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
  307. {
  308. ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
  309. ref += (y_pos * picwidth) + x_pos + (dx / 2);
  310. DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
  311. }
  312. else /* need padding */
  313. {
  314. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
  315. ref2 = &temp[2 + (dy/2)][2];
  316. ref = &temp[2][2 + (dx/2)];
  317. DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
  318. }
  319. }
  320. return ;
  321. }
  322. void CreateAlign(uint8 *ref, int picwidth, int y_pos,
  323. uint8 *out, int blkwidth, int blkheight)
  324. {
  325. int i, j;
  326. int offset, out_offset;
  327. uint32 prev_pix, result, pix1, pix2, pix4;
  328. out_offset = 24 - blkwidth;
  329. //switch(x_pos&0x3){
  330. switch (((uint32)ref)&0x3)
  331. {
  332. case 1:
  333. ref += y_pos * picwidth;
  334. offset = picwidth - blkwidth - 3;
  335. for (j = 0; j < blkheight; j++)
  336. {
  337. pix1 = *ref++;
  338. pix2 = *((uint16*)ref);
  339. ref += 2;
  340. result = (pix2 << 8) | pix1;
  341. for (i = 3; i < blkwidth; i += 4)
  342. {
  343. pix4 = *((uint32*)ref);
  344. ref += 4;
  345. prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
  346. result |= prev_pix;
  347. *((uint32*)out) = result; /* write 4 bytes */
  348. out += 4;
  349. result = pix4 >> 8; /* for the next loop */
  350. }
  351. ref += offset;
  352. out += out_offset;
  353. }
  354. break;
  355. case 2:
  356. ref += y_pos * picwidth;
  357. offset = picwidth - blkwidth - 2;
  358. for (j = 0; j < blkheight; j++)
  359. {
  360. result = *((uint16*)ref);
  361. ref += 2;
  362. for (i = 2; i < blkwidth; i += 4)
  363. {
  364. pix4 = *((uint32*)ref);
  365. ref += 4;
  366. prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
  367. result |= prev_pix;
  368. *((uint32*)out) = result; /* write 4 bytes */
  369. out += 4;
  370. result = pix4 >> 16; /* for the next loop */
  371. }
  372. ref += offset;
  373. out += out_offset;
  374. }
  375. break;
  376. case 3:
  377. ref += y_pos * picwidth;
  378. offset = picwidth - blkwidth - 1;
  379. for (j = 0; j < blkheight; j++)
  380. {
  381. result = *ref++;
  382. for (i = 1; i < blkwidth; i += 4)
  383. {
  384. pix4 = *((uint32*)ref);
  385. ref += 4;
  386. prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
  387. result |= prev_pix;
  388. *((uint32*)out) = result; /* write 4 bytes */
  389. out += 4;
  390. result = pix4 >> 24; /* for the next loop */
  391. }
  392. ref += offset;
  393. out += out_offset;
  394. }
  395. break;
  396. }
  397. }
  398. void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
  399. uint8 *out, int blkwidth, int blkheight)
  400. {
  401. int x_inc0, x_mid;
  402. int y_inc, y_inc0, y_inc1, y_mid;
  403. int i, j;
  404. int offset;
  405. if (x_pos < 0)
  406. {
  407. x_inc0 = 0; /* increment for the first part */
  408. x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth); /* stopping point */
  409. x_pos = 0;
  410. }
  411. else if (x_pos + blkwidth > picwidth)
  412. {
  413. x_inc0 = 1; /* increasing */
  414. x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0); /* clip negative to zero, encode fool proof! */
  415. }
  416. else /* normal case */
  417. {
  418. x_inc0 = 1;
  419. x_mid = blkwidth; /* just one run */
  420. }
  421. /* boundary for y_pos, taking the result from x_pos into account */
  422. if (y_pos < 0)
  423. {
  424. y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
  425. y_inc1 = picwidth + y_inc0;
  426. y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
  427. y_pos = 0;
  428. }
  429. else if (y_pos + blkheight > picheight)
  430. {
  431. y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
  432. y_inc0 = picwidth + y_inc1; /* increasing */
  433. y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
  434. }
  435. else /* normal case */
  436. {
  437. y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
  438. y_inc0 = picwidth + y_inc1;
  439. y_mid = blkheight;
  440. }
  441. /* clip y_pos and x_pos */
  442. if (y_pos > picheight - 1) y_pos = picheight - 1;
  443. if (x_pos > picwidth - 1) x_pos = picwidth - 1;
  444. ref += y_pos * picwidth + x_pos;
  445. y_inc = y_inc0; /* start with top half */
  446. offset = 24 - blkwidth; /* to use in offset out */
  447. blkwidth -= x_mid; /* to use in the loop limit */
  448. if (x_inc0 == 0)
  449. {
  450. for (j = 0; j < blkheight; j++)
  451. {
  452. if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */
  453. {
  454. y_inc = y_inc1; /* switch to lower half */
  455. }
  456. for (i = x_mid; i > 0; i--) /* first or third quarter */
  457. {
  458. *out++ = *ref;
  459. }
  460. for (i = blkwidth; i > 0; i--) /* second or fourth quarter */
  461. {
  462. *out++ = *ref++;
  463. }
  464. out += offset;
  465. ref += y_inc;
  466. }
  467. }
  468. else
  469. {
  470. for (j = 0; j < blkheight; j++)
  471. {
  472. if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */
  473. {
  474. y_inc = y_inc1; /* switch to lower half */
  475. }
  476. for (i = x_mid; i > 0; i--) /* first or third quarter */
  477. {
  478. *out++ = *ref++;
  479. }
  480. for (i = blkwidth; i > 0; i--) /* second or fourth quarter */
  481. {
  482. *out++ = *ref;
  483. }
  484. out += offset;
  485. ref += y_inc;
  486. }
  487. }
  488. return ;
  489. }
  490. void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
  491. int blkwidth, int blkheight, int dx)
  492. {
  493. uint8 *p_ref;
  494. uint32 *p_cur;
  495. uint32 tmp;
  496. uint32 pkres;
  497. int result, curr_offset, ref_offset;
  498. int j;
  499. int32 r0, r1, r2, r3, r4, r5;
  500. int32 r13, r6;
  501. p_cur = (uint32*)out; /* assume it's word aligned */
  502. curr_offset = (outpitch - blkwidth) >> 2;
  503. p_ref = in;
  504. ref_offset = inpitch - blkwidth;
  505. if (dx&1)
  506. {
  507. dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
  508. p_ref -= 2;
  509. r13 = 0;
  510. for (j = blkheight; j > 0; j--)
  511. {
  512. tmp = (uint32)(p_ref + blkwidth);
  513. r0 = p_ref[0];
  514. r1 = p_ref[2];
  515. r0 |= (r1 << 16); /* 0,c,0,a */
  516. r1 = p_ref[1];
  517. r2 = p_ref[3];
  518. r1 |= (r2 << 16); /* 0,d,0,b */
  519. while ((uint32)p_ref < tmp)
  520. {
  521. r2 = *(p_ref += 4); /* move pointer to e */
  522. r3 = p_ref[2];
  523. r2 |= (r3 << 16); /* 0,g,0,e */
  524. r3 = p_ref[1];
  525. r4 = p_ref[3];
  526. r3 |= (r4 << 16); /* 0,h,0,f */
  527. r4 = r0 + r3; /* c+h, a+f */
  528. r5 = r0 + r1; /* c+d, a+b */
  529. r6 = r2 + r3; /* g+h, e+f */
  530. r5 >>= 16;
  531. r5 |= (r6 << 16); /* e+f, c+d */
  532. r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
  533. r4 += 0x100010; /* +16, +16 */
  534. r5 = r1 + r2; /* d+g, b+e */
  535. r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
  536. r4 >>= 5;
  537. r13 |= r4; /* check clipping */
  538. r5 = p_ref[dx+2];
  539. r6 = p_ref[dx+4];
  540. r5 |= (r6 << 16);
  541. r4 += r5;
  542. r4 += 0x10001;
  543. r4 = (r4 >> 1) & 0xFF00FF;
  544. r5 = p_ref[4]; /* i */
  545. r6 = (r5 << 16);
  546. r5 = r6 | (r2 >> 16);/* 0,i,0,g */
  547. r5 += r1; /* d+i, b+g */ /* r5 not free */
  548. r1 >>= 16;
  549. r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
  550. r1 += r2; /* f+g, d+e */
  551. r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
  552. r0 >>= 16;
  553. r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
  554. r0 += r3; /* e+h, c+f */
  555. r5 += 0x100010; /* 16,16 */
  556. r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
  557. r5 >>= 5;
  558. r13 |= r5; /* check clipping */
  559. r0 = p_ref[dx+3];
  560. r1 = p_ref[dx+5];
  561. r0 |= (r1 << 16);
  562. r5 += r0;
  563. r5 += 0x10001;
  564. r5 = (r5 >> 1) & 0xFF00FF;
  565. r4 |= (r5 << 8); /* pack them together */
  566. *p_cur++ = r4;
  567. r1 = r3;
  568. r0 = r2;
  569. }
  570. p_cur += curr_offset; /* move to the next line */
  571. p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
  572. if (r13&0xFF000700) /* need clipping */
  573. {
  574. /* move back to the beginning of the line */
  575. p_ref -= (ref_offset + blkwidth); /* input */
  576. p_cur -= (outpitch >> 2);
  577. tmp = (uint32)(p_ref + blkwidth);
  578. for (; (uint32)p_ref < tmp;)
  579. {
  580. r0 = *p_ref++;
  581. r1 = *p_ref++;
  582. r2 = *p_ref++;
  583. r3 = *p_ref++;
  584. r4 = *p_ref++;
  585. /* first pixel */
  586. r5 = *p_ref++;
  587. result = (r0 + r5);
  588. r0 = (r1 + r4);
  589. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  590. r0 = (r2 + r3);
  591. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  592. result = (result + 16) >> 5;
  593. CLIP_RESULT(result)
  594. /* 3/4 pel, no need to clip */
  595. result = (result + p_ref[dx] + 1);
  596. pkres = (result >> 1) ;
  597. /* second pixel */
  598. r0 = *p_ref++;
  599. result = (r1 + r0);
  600. r1 = (r2 + r5);
  601. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  602. r1 = (r3 + r4);
  603. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  604. result = (result + 16) >> 5;
  605. CLIP_RESULT(result)
  606. /* 3/4 pel, no need to clip */
  607. result = (result + p_ref[dx] + 1);
  608. result = (result >> 1);
  609. pkres |= (result << 8);
  610. /* third pixel */
  611. r1 = *p_ref++;
  612. result = (r2 + r1);
  613. r2 = (r3 + r0);
  614. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  615. r2 = (r4 + r5);
  616. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  617. result = (result + 16) >> 5;
  618. CLIP_RESULT(result)
  619. /* 3/4 pel, no need to clip */
  620. result = (result + p_ref[dx] + 1);
  621. result = (result >> 1);
  622. pkres |= (result << 16);
  623. /* fourth pixel */
  624. r2 = *p_ref++;
  625. result = (r3 + r2);
  626. r3 = (r4 + r1);
  627. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  628. r3 = (r5 + r0);
  629. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  630. result = (result + 16) >> 5;
  631. CLIP_RESULT(result)
  632. /* 3/4 pel, no need to clip */
  633. result = (result + p_ref[dx] + 1);
  634. result = (result >> 1);
  635. pkres |= (result << 24);
  636. *p_cur++ = pkres; /* write 4 pixels */
  637. p_ref -= 5; /* offset back to the middle of filter */
  638. }
  639. p_cur += curr_offset; /* move to the next line */
  640. p_ref += ref_offset; /* move to the next line */
  641. }
  642. }
  643. }
  644. else
  645. {
  646. p_ref -= 2;
  647. r13 = 0;
  648. for (j = blkheight; j > 0; j--)
  649. {
  650. tmp = (uint32)(p_ref + blkwidth);
  651. r0 = p_ref[0];
  652. r1 = p_ref[2];
  653. r0 |= (r1 << 16); /* 0,c,0,a */
  654. r1 = p_ref[1];
  655. r2 = p_ref[3];
  656. r1 |= (r2 << 16); /* 0,d,0,b */
  657. while ((uint32)p_ref < tmp)
  658. {
  659. r2 = *(p_ref += 4); /* move pointer to e */
  660. r3 = p_ref[2];
  661. r2 |= (r3 << 16); /* 0,g,0,e */
  662. r3 = p_ref[1];
  663. r4 = p_ref[3];
  664. r3 |= (r4 << 16); /* 0,h,0,f */
  665. r4 = r0 + r3; /* c+h, a+f */
  666. r5 = r0 + r1; /* c+d, a+b */
  667. r6 = r2 + r3; /* g+h, e+f */
  668. r5 >>= 16;
  669. r5 |= (r6 << 16); /* e+f, c+d */
  670. r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
  671. r4 += 0x100010; /* +16, +16 */
  672. r5 = r1 + r2; /* d+g, b+e */
  673. r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
  674. r4 >>= 5;
  675. r13 |= r4; /* check clipping */
  676. r4 &= 0xFF00FF; /* mask */
  677. r5 = p_ref[4]; /* i */
  678. r6 = (r5 << 16);
  679. r5 = r6 | (r2 >> 16);/* 0,i,0,g */
  680. r5 += r1; /* d+i, b+g */ /* r5 not free */
  681. r1 >>= 16;
  682. r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
  683. r1 += r2; /* f+g, d+e */
  684. r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
  685. r0 >>= 16;
  686. r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
  687. r0 += r3; /* e+h, c+f */
  688. r5 += 0x100010; /* 16,16 */
  689. r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
  690. r5 >>= 5;
  691. r13 |= r5; /* check clipping */
  692. r5 &= 0xFF00FF; /* mask */
  693. r4 |= (r5 << 8); /* pack them together */
  694. *p_cur++ = r4;
  695. r1 = r3;
  696. r0 = r2;
  697. }
  698. p_cur += curr_offset; /* move to the next line */
  699. p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
  700. if (r13&0xFF000700) /* need clipping */
  701. {
  702. /* move back to the beginning of the line */
  703. p_ref -= (ref_offset + blkwidth); /* input */
  704. p_cur -= (outpitch >> 2);
  705. tmp = (uint32)(p_ref + blkwidth);
  706. for (; (uint32)p_ref < tmp;)
  707. {
  708. r0 = *p_ref++;
  709. r1 = *p_ref++;
  710. r2 = *p_ref++;
  711. r3 = *p_ref++;
  712. r4 = *p_ref++;
  713. /* first pixel */
  714. r5 = *p_ref++;
  715. result = (r0 + r5);
  716. r0 = (r1 + r4);
  717. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  718. r0 = (r2 + r3);
  719. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  720. result = (result + 16) >> 5;
  721. CLIP_RESULT(result)
  722. pkres = result;
  723. /* second pixel */
  724. r0 = *p_ref++;
  725. result = (r1 + r0);
  726. r1 = (r2 + r5);
  727. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  728. r1 = (r3 + r4);
  729. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  730. result = (result + 16) >> 5;
  731. CLIP_RESULT(result)
  732. pkres |= (result << 8);
  733. /* third pixel */
  734. r1 = *p_ref++;
  735. result = (r2 + r1);
  736. r2 = (r3 + r0);
  737. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  738. r2 = (r4 + r5);
  739. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  740. result = (result + 16) >> 5;
  741. CLIP_RESULT(result)
  742. pkres |= (result << 16);
  743. /* fourth pixel */
  744. r2 = *p_ref++;
  745. result = (r3 + r2);
  746. r3 = (r4 + r1);
  747. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  748. r3 = (r5 + r0);
  749. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  750. result = (result + 16) >> 5;
  751. CLIP_RESULT(result)
  752. pkres |= (result << 24);
  753. *p_cur++ = pkres; /* write 4 pixels */
  754. p_ref -= 5;
  755. }
  756. p_cur += curr_offset; /* move to the next line */
  757. p_ref += ref_offset;
  758. }
  759. }
  760. }
  761. return ;
  762. }
  763. void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
  764. int blkwidth, int blkheight, int dx)
  765. {
  766. int *p_ref;
  767. uint32 *p_cur;
  768. uint32 tmp, pkres;
  769. int result, result2, curr_offset, ref_offset;
  770. int j, r0, r1, r2, r3, r4, r5;
  771. p_cur = (uint32*)out; /* assume it's word aligned */
  772. curr_offset = (outpitch - blkwidth) >> 2;
  773. p_ref = in;
  774. ref_offset = inpitch - blkwidth;
  775. if (dx&1)
  776. {
  777. dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
  778. for (j = blkheight; j > 0 ; j--)
  779. {
  780. tmp = (uint32)(p_ref + blkwidth);
  781. for (; (uint32)p_ref < tmp;)
  782. {
  783. r0 = p_ref[-2];
  784. r1 = p_ref[-1];
  785. r2 = *p_ref++;
  786. r3 = *p_ref++;
  787. r4 = *p_ref++;
  788. /* first pixel */
  789. r5 = *p_ref++;
  790. result = (r0 + r5);
  791. r0 = (r1 + r4);
  792. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  793. r0 = (r2 + r3);
  794. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  795. result = (result + 512) >> 10;
  796. CLIP_RESULT(result)
  797. result2 = ((p_ref[dx] + 16) >> 5);
  798. CLIP_RESULT(result2)
  799. /* 3/4 pel, no need to clip */
  800. result = (result + result2 + 1);
  801. pkres = (result >> 1);
  802. /* second pixel */
  803. r0 = *p_ref++;
  804. result = (r1 + r0);
  805. r1 = (r2 + r5);
  806. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  807. r1 = (r3 + r4);
  808. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  809. result = (result + 512) >> 10;
  810. CLIP_RESULT(result)
  811. result2 = ((p_ref[dx] + 16) >> 5);
  812. CLIP_RESULT(result2)
  813. /* 3/4 pel, no need to clip */
  814. result = (result + result2 + 1);
  815. result = (result >> 1);
  816. pkres |= (result << 8);
  817. /* third pixel */
  818. r1 = *p_ref++;
  819. result = (r2 + r1);
  820. r2 = (r3 + r0);
  821. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  822. r2 = (r4 + r5);
  823. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  824. result = (result + 512) >> 10;
  825. CLIP_RESULT(result)
  826. result2 = ((p_ref[dx] + 16) >> 5);
  827. CLIP_RESULT(result2)
  828. /* 3/4 pel, no need to clip */
  829. result = (result + result2 + 1);
  830. result = (result >> 1);
  831. pkres |= (result << 16);
  832. /* fourth pixel */
  833. r2 = *p_ref++;
  834. result = (r3 + r2);
  835. r3 = (r4 + r1);
  836. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  837. r3 = (r5 + r0);
  838. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  839. result = (result + 512) >> 10;
  840. CLIP_RESULT(result)
  841. result2 = ((p_ref[dx] + 16) >> 5);
  842. CLIP_RESULT(result2)
  843. /* 3/4 pel, no need to clip */
  844. result = (result + result2 + 1);
  845. result = (result >> 1);
  846. pkres |= (result << 24);
  847. *p_cur++ = pkres; /* write 4 pixels */
  848. p_ref -= 3; /* offset back to the middle of filter */
  849. }
  850. p_cur += curr_offset; /* move to the next line */
  851. p_ref += ref_offset; /* move to the next line */
  852. }
  853. }
  854. else
  855. {
  856. for (j = blkheight; j > 0 ; j--)
  857. {
  858. tmp = (uint32)(p_ref + blkwidth);
  859. for (; (uint32)p_ref < tmp;)
  860. {
  861. r0 = p_ref[-2];
  862. r1 = p_ref[-1];
  863. r2 = *p_ref++;
  864. r3 = *p_ref++;
  865. r4 = *p_ref++;
  866. /* first pixel */
  867. r5 = *p_ref++;
  868. result = (r0 + r5);
  869. r0 = (r1 + r4);
  870. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  871. r0 = (r2 + r3);
  872. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  873. result = (result + 512) >> 10;
  874. CLIP_RESULT(result)
  875. pkres = result;
  876. /* second pixel */
  877. r0 = *p_ref++;
  878. result = (r1 + r0);
  879. r1 = (r2 + r5);
  880. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  881. r1 = (r3 + r4);
  882. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  883. result = (result + 512) >> 10;
  884. CLIP_RESULT(result)
  885. pkres |= (result << 8);
  886. /* third pixel */
  887. r1 = *p_ref++;
  888. result = (r2 + r1);
  889. r2 = (r3 + r0);
  890. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  891. r2 = (r4 + r5);
  892. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  893. result = (result + 512) >> 10;
  894. CLIP_RESULT(result)
  895. pkres |= (result << 16);
  896. /* fourth pixel */
  897. r2 = *p_ref++;
  898. result = (r3 + r2);
  899. r3 = (r4 + r1);
  900. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  901. r3 = (r5 + r0);
  902. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  903. result = (result + 512) >> 10;
  904. CLIP_RESULT(result)
  905. pkres |= (result << 24);
  906. *p_cur++ = pkres; /* write 4 pixels */
  907. p_ref -= 3; /* offset back to the middle of filter */
  908. }
  909. p_cur += curr_offset; /* move to the next line */
  910. p_ref += ref_offset; /* move to the next line */
  911. }
  912. }
  913. return ;
  914. }
  /*
   * HorzInterp3MC
   *
   * Horizontal 6-tap filter with coefficients (1, -5, 20, 20, -5, 1).
   * For every output position x the value
   *     in[x-2] - 5*in[x-1] + 20*in[x] + 20*in[x+1] - 5*in[x+2] + in[x+3]
   * is stored to `out` WITHOUT rounding, shifting or clipping, so a later
   * pass can continue at full precision.
   *
   *   in        first sample of the block; columns -2 .. +6 relative to each
   *             group of four outputs are read
   *   inpitch   distance in bytes between input rows
   *   out       destination for the intermediate sums (one int per sample)
   *   outpitch  distance in ints between output rows
   *   blkwidth  block width; samples are produced in groups of four, so a
   *             width that is not a multiple of 4 is rounded up per row
   *   blkheight number of rows
   *
   * The row limit is kept as a pointer and compared as a pointer; the
   * previous code cast it to a 32-bit integer, which truncates addresses on
   * 64-bit targets.
   */
  void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
                     int blkwidth, int blkheight)
  {
      uint8 *p_ref = in;
      int *p_cur = out;
      int curr_offset = outpitch - blkwidth;  /* end of row -> next row */
      int ref_offset = inpitch - blkwidth;
      int tap[9];                             /* in[-2] .. in[+6] of a group */
      int j, k;

      for (j = blkheight; j > 0; j--)
      {
          uint8 *row_end = p_ref + blkwidth;

          while (p_ref < row_end)
          {
              /* fetch all nine samples first, then emit four outputs */
              for (k = 0; k < 9; k++)
              {
                  tap[k] = p_ref[k - 2];
              }
              for (k = 0; k < 4; k++)
              {
                  *p_cur++ = (tap[k] + tap[k + 5])
                             - 5 * (tap[k + 1] + tap[k + 4])
                             + 20 * (tap[k + 2] + tap[k + 3]);
              }
              p_ref += 4;
          }
          p_cur += curr_offset; /* move to the next line */
          p_ref += ref_offset;
      }
      return ;
  }
  976. void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
  977. int blkwidth, int blkheight, int dy)
  978. {
  979. uint8 *p_cur, *p_ref;
  980. uint32 tmp;
  981. int result, curr_offset, ref_offset;
  982. int j, i;
  983. int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
  984. uint8 tmp_in[24][24];
  985. /* not word-aligned */
  986. if (((uint32)in)&0x3)
  987. {
  988. CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
  989. in = &tmp_in[2][0];
  990. inpitch = 24;
  991. }
  992. p_cur = out;
  993. curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
  994. ref_offset = blkheight * inpitch; /* for limit */
  995. curr_offset += 3;
  996. if (dy&1)
  997. {
  998. dy = (dy >> 1) ? 0 : -inpitch;
  999. for (j = 0; j < blkwidth; j += 4, in += 4)
  1000. {
  1001. r13 = 0;
  1002. p_ref = in;
  1003. p_cur -= outpitch; /* compensate for the first offset */
  1004. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1005. while ((uint32)p_ref < tmp) /* the loop un-rolled */
  1006. {
  1007. r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
  1008. p_ref += inpitch;
  1009. r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
  1010. r0 &= 0xFF00FF;
  1011. r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
  1012. r7 = (r1 >> 8) & 0xFF00FF;
  1013. r1 &= 0xFF00FF;
  1014. r0 += r1;
  1015. r6 += r7;
  1016. r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
  1017. r8 = (r2 >> 8) & 0xFF00FF;
  1018. r2 &= 0xFF00FF;
  1019. r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
  1020. r7 = (r1 >> 8) & 0xFF00FF;
  1021. r1 &= 0xFF00FF;
  1022. r1 += r2;
  1023. r7 += r8;
  1024. r0 += 20 * r1;
  1025. r6 += 20 * r7;
  1026. r0 += 0x100010;
  1027. r6 += 0x100010;
  1028. r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
  1029. r8 = (r2 >> 8) & 0xFF00FF;
  1030. r2 &= 0xFF00FF;
  1031. r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
  1032. r7 = (r1 >> 8) & 0xFF00FF;
  1033. r1 &= 0xFF00FF;
  1034. r1 += r2;
  1035. r7 += r8;
  1036. r0 -= 5 * r1;
  1037. r6 -= 5 * r7;
  1038. r0 >>= 5;
  1039. r6 >>= 5;
  1040. /* clip */
  1041. r13 |= r6;
  1042. r13 |= r0;
  1043. //CLIPPACK(r6,result)
  1044. r1 = *((uint32*)(p_ref + dy));
  1045. r2 = (r1 >> 8) & 0xFF00FF;
  1046. r1 &= 0xFF00FF;
  1047. r0 += r1;
  1048. r6 += r2;
  1049. r0 += 0x10001;
  1050. r6 += 0x10001;
  1051. r0 = (r0 >> 1) & 0xFF00FF;
  1052. r6 = (r6 >> 1) & 0xFF00FF;
  1053. r0 |= (r6 << 8); /* pack it back */
  1054. *((uint32*)(p_cur += outpitch)) = r0;
  1055. }
  1056. p_cur += curr_offset; /* offset to the next pixel */
  1057. if (r13 & 0xFF000700) /* this column need clipping */
  1058. {
  1059. p_cur -= 4;
  1060. for (i = 0; i < 4; i++)
  1061. {
  1062. p_ref = in + i;
  1063. p_cur -= outpitch; /* compensate for the first offset */
  1064. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1065. while ((uint32)p_ref < tmp)
  1066. { /* loop un-rolled */
  1067. r0 = *(p_ref - (inpitch << 1));
  1068. r1 = *(p_ref - inpitch);
  1069. r2 = *p_ref;
  1070. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1071. r4 = *(p_ref += inpitch);
  1072. /* first pixel */
  1073. r5 = *(p_ref += inpitch);
  1074. result = (r0 + r5);
  1075. r0 = (r1 + r4);
  1076. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1077. r0 = (r2 + r3);
  1078. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1079. result = (result + 16) >> 5;
  1080. CLIP_RESULT(result)
  1081. /* 3/4 pel, no need to clip */
  1082. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1083. result = (result >> 1);
  1084. *(p_cur += outpitch) = result;
  1085. /* second pixel */
  1086. r0 = *(p_ref += inpitch);
  1087. result = (r1 + r0);
  1088. r1 = (r2 + r5);
  1089. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1090. r1 = (r3 + r4);
  1091. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1092. result = (result + 16) >> 5;
  1093. CLIP_RESULT(result)
  1094. /* 3/4 pel, no need to clip */
  1095. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1096. result = (result >> 1);
  1097. *(p_cur += outpitch) = result;
  1098. /* third pixel */
  1099. r1 = *(p_ref += inpitch);
  1100. result = (r2 + r1);
  1101. r2 = (r3 + r0);
  1102. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1103. r2 = (r4 + r5);
  1104. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1105. result = (result + 16) >> 5;
  1106. CLIP_RESULT(result)
  1107. /* 3/4 pel, no need to clip */
  1108. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1109. result = (result >> 1);
  1110. *(p_cur += outpitch) = result;
  1111. /* fourth pixel */
  1112. r2 = *(p_ref += inpitch);
  1113. result = (r3 + r2);
  1114. r3 = (r4 + r1);
  1115. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1116. r3 = (r5 + r0);
  1117. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1118. result = (result + 16) >> 5;
  1119. CLIP_RESULT(result)
  1120. /* 3/4 pel, no need to clip */
  1121. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1122. result = (result >> 1);
  1123. *(p_cur += outpitch) = result;
  1124. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1125. }
  1126. p_cur += (curr_offset - 3);
  1127. }
  1128. }
  1129. }
  1130. }
  1131. else
  1132. {
  1133. for (j = 0; j < blkwidth; j += 4, in += 4)
  1134. {
  1135. r13 = 0;
  1136. p_ref = in;
  1137. p_cur -= outpitch; /* compensate for the first offset */
  1138. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1139. while ((uint32)p_ref < tmp) /* the loop un-rolled */
  1140. {
  1141. r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
  1142. p_ref += inpitch;
  1143. r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
  1144. r0 &= 0xFF00FF;
  1145. r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
  1146. r7 = (r1 >> 8) & 0xFF00FF;
  1147. r1 &= 0xFF00FF;
  1148. r0 += r1;
  1149. r6 += r7;
  1150. r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
  1151. r8 = (r2 >> 8) & 0xFF00FF;
  1152. r2 &= 0xFF00FF;
  1153. r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
  1154. r7 = (r1 >> 8) & 0xFF00FF;
  1155. r1 &= 0xFF00FF;
  1156. r1 += r2;
  1157. r7 += r8;
  1158. r0 += 20 * r1;
  1159. r6 += 20 * r7;
  1160. r0 += 0x100010;
  1161. r6 += 0x100010;
  1162. r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
  1163. r8 = (r2 >> 8) & 0xFF00FF;
  1164. r2 &= 0xFF00FF;
  1165. r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
  1166. r7 = (r1 >> 8) & 0xFF00FF;
  1167. r1 &= 0xFF00FF;
  1168. r1 += r2;
  1169. r7 += r8;
  1170. r0 -= 5 * r1;
  1171. r6 -= 5 * r7;
  1172. r0 >>= 5;
  1173. r6 >>= 5;
  1174. /* clip */
  1175. r13 |= r6;
  1176. r13 |= r0;
  1177. //CLIPPACK(r6,result)
  1178. r0 &= 0xFF00FF;
  1179. r6 &= 0xFF00FF;
  1180. r0 |= (r6 << 8); /* pack it back */
  1181. *((uint32*)(p_cur += outpitch)) = r0;
  1182. }
  1183. p_cur += curr_offset; /* offset to the next pixel */
  1184. if (r13 & 0xFF000700) /* this column need clipping */
  1185. {
  1186. p_cur -= 4;
  1187. for (i = 0; i < 4; i++)
  1188. {
  1189. p_ref = in + i;
  1190. p_cur -= outpitch; /* compensate for the first offset */
  1191. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1192. while ((uint32)p_ref < tmp)
  1193. { /* loop un-rolled */
  1194. r0 = *(p_ref - (inpitch << 1));
  1195. r1 = *(p_ref - inpitch);
  1196. r2 = *p_ref;
  1197. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1198. r4 = *(p_ref += inpitch);
  1199. /* first pixel */
  1200. r5 = *(p_ref += inpitch);
  1201. result = (r0 + r5);
  1202. r0 = (r1 + r4);
  1203. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1204. r0 = (r2 + r3);
  1205. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1206. result = (result + 16) >> 5;
  1207. CLIP_RESULT(result)
  1208. *(p_cur += outpitch) = result;
  1209. /* second pixel */
  1210. r0 = *(p_ref += inpitch);
  1211. result = (r1 + r0);
  1212. r1 = (r2 + r5);
  1213. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1214. r1 = (r3 + r4);
  1215. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1216. result = (result + 16) >> 5;
  1217. CLIP_RESULT(result)
  1218. *(p_cur += outpitch) = result;
  1219. /* third pixel */
  1220. r1 = *(p_ref += inpitch);
  1221. result = (r2 + r1);
  1222. r2 = (r3 + r0);
  1223. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1224. r2 = (r4 + r5);
  1225. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1226. result = (result + 16) >> 5;
  1227. CLIP_RESULT(result)
  1228. *(p_cur += outpitch) = result;
  1229. /* fourth pixel */
  1230. r2 = *(p_ref += inpitch);
  1231. result = (r3 + r2);
  1232. r3 = (r4 + r1);
  1233. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1234. r3 = (r5 + r0);
  1235. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1236. result = (result + 16) >> 5;
  1237. CLIP_RESULT(result)
  1238. *(p_cur += outpitch) = result;
  1239. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1240. }
  1241. p_cur += (curr_offset - 3);
  1242. }
  1243. }
  1244. }
  1245. }
  1246. return ;
  1247. }
  /*
   * VertInterp2MC
   *
   * Vertical 6-tap filter with coefficients (1, -5, 20, 20, -5, 1).
   * For every output row y of every column the value
   *     in[y-2] - 5*in[y-1] + 20*in[y] + 20*in[y+1] - 5*in[y+2] + in[y+3]
   * (rows of the same column) is stored to `out` WITHOUT rounding, shifting
   * or clipping.
   *
   *   in        first sample of the block; rows -2 .. +6 relative to each
   *             group of four output rows are read
   *   inpitch   input row pitch in bytes
   *   out       destination for the intermediate sums (one int per sample)
   *   outpitch  output row pitch in ints
   *   blkwidth  number of columns
   *   blkheight number of rows; rows are produced in groups of four
   *
   * The column limit is kept and compared as a pointer. The previous code
   * cast it to a 32-bit integer, which truncates addresses on 64-bit
   * targets, and it stepped `out` one row before the buffer before writing.
   */
  void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
                     int blkwidth, int blkheight)
  {
      int *p_cur = out;
      int col_step = 1 - outpitch * blkheight; /* bottom of column -> top of next */
      int ref_offset = blkheight * inpitch;    /* column length, for the limit */
      int tap[9];
      int j, k;

      for (j = 0; j < blkwidth; j++)
      {
          uint8 *p_ref = in++;
          uint8 *col_end = p_ref + ref_offset;

          while (p_ref < col_end)
          {
              /* fetch rows -2 .. +6 first, then emit four output rows */
              for (k = 0; k < 9; k++)
              {
                  tap[k] = p_ref[(k - 2) * inpitch];
              }
              for (k = 0; k < 4; k++)
              {
                  *p_cur = (tap[k] + tap[k + 5])
                           - 5 * (tap[k + 1] + tap[k + 4])
                           + 20 * (tap[k + 2] + tap[k + 3]);
                  p_cur += outpitch;
              }
              p_ref += (inpitch << 2);
          }
          p_cur += col_step;
      }
      return ;
  }
  1309. void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
  1310. int blkwidth, int blkheight, int dy)
  1311. {
  1312. uint8 *p_cur;
  1313. int *p_ref;
  1314. uint32 tmp;
  1315. int result, result2, curr_offset, ref_offset;
  1316. int j, r0, r1, r2, r3, r4, r5;
  1317. p_cur = out;
  1318. curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
  1319. ref_offset = blkheight * inpitch; /* for limit */
  1320. if (dy&1)
  1321. {
  1322. dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
  1323. for (j = 0; j < blkwidth; j++)
  1324. {
  1325. p_cur -= outpitch; /* compensate for the first offset */
  1326. p_ref = in++;
  1327. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1328. while ((uint32)p_ref < tmp)
  1329. { /* loop un-rolled */
  1330. r0 = *(p_ref - (inpitch << 1));
  1331. r1 = *(p_ref - inpitch);
  1332. r2 = *p_ref;
  1333. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1334. r4 = *(p_ref += inpitch);
  1335. /* first pixel */
  1336. r5 = *(p_ref += inpitch);
  1337. result = (r0 + r5);
  1338. r0 = (r1 + r4);
  1339. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1340. r0 = (r2 + r3);
  1341. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1342. result = (result + 512) >> 10;
  1343. CLIP_RESULT(result)
  1344. result2 = ((p_ref[dy] + 16) >> 5);
  1345. CLIP_RESULT(result2)
  1346. /* 3/4 pel, no need to clip */
  1347. result = (result + result2 + 1);
  1348. result = (result >> 1);
  1349. *(p_cur += outpitch) = result;
  1350. /* second pixel */
  1351. r0 = *(p_ref += inpitch);
  1352. result = (r1 + r0);
  1353. r1 = (r2 + r5);
  1354. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1355. r1 = (r3 + r4);
  1356. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1357. result = (result + 512) >> 10;
  1358. CLIP_RESULT(result)
  1359. result2 = ((p_ref[dy] + 16) >> 5);
  1360. CLIP_RESULT(result2)
  1361. /* 3/4 pel, no need to clip */
  1362. result = (result + result2 + 1);
  1363. result = (result >> 1);
  1364. *(p_cur += outpitch) = result;
  1365. /* third pixel */
  1366. r1 = *(p_ref += inpitch);
  1367. result = (r2 + r1);
  1368. r2 = (r3 + r0);
  1369. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1370. r2 = (r4 + r5);
  1371. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1372. result = (result + 512) >> 10;
  1373. CLIP_RESULT(result)
  1374. result2 = ((p_ref[dy] + 16) >> 5);
  1375. CLIP_RESULT(result2)
  1376. /* 3/4 pel, no need to clip */
  1377. result = (result + result2 + 1);
  1378. result = (result >> 1);
  1379. *(p_cur += outpitch) = result;
  1380. /* fourth pixel */
  1381. r2 = *(p_ref += inpitch);
  1382. result = (r3 + r2);
  1383. r3 = (r4 + r1);
  1384. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1385. r3 = (r5 + r0);
  1386. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1387. result = (result + 512) >> 10;
  1388. CLIP_RESULT(result)
  1389. result2 = ((p_ref[dy] + 16) >> 5);
  1390. CLIP_RESULT(result2)
  1391. /* 3/4 pel, no need to clip */
  1392. result = (result + result2 + 1);
  1393. result = (result >> 1);
  1394. *(p_cur += outpitch) = result;
  1395. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1396. }
  1397. p_cur += curr_offset;
  1398. }
  1399. }
  1400. else
  1401. {
  1402. for (j = 0; j < blkwidth; j++)
  1403. {
  1404. p_cur -= outpitch; /* compensate for the first offset */
  1405. p_ref = in++;
  1406. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1407. while ((uint32)p_ref < tmp)
  1408. { /* loop un-rolled */
  1409. r0 = *(p_ref - (inpitch << 1));
  1410. r1 = *(p_ref - inpitch);
  1411. r2 = *p_ref;
  1412. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1413. r4 = *(p_ref += inpitch);
  1414. /* first pixel */
  1415. r5 = *(p_ref += inpitch);
  1416. result = (r0 + r5);
  1417. r0 = (r1 + r4);
  1418. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1419. r0 = (r2 + r3);
  1420. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1421. result = (result + 512) >> 10;
  1422. CLIP_RESULT(result)
  1423. *(p_cur += outpitch) = result;
  1424. /* second pixel */
  1425. r0 = *(p_ref += inpitch);
  1426. result = (r1 + r0);
  1427. r1 = (r2 + r5);
  1428. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1429. r1 = (r3 + r4);
  1430. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1431. result = (result + 512) >> 10;
  1432. CLIP_RESULT(result)
  1433. *(p_cur += outpitch) = result;
  1434. /* third pixel */
  1435. r1 = *(p_ref += inpitch);
  1436. result = (r2 + r1);
  1437. r2 = (r3 + r0);
  1438. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1439. r2 = (r4 + r5);
  1440. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1441. result = (result + 512) >> 10;
  1442. CLIP_RESULT(result)
  1443. *(p_cur += outpitch) = result;
  1444. /* fourth pixel */
  1445. r2 = *(p_ref += inpitch);
  1446. result = (r3 + r2);
  1447. r3 = (r4 + r1);
  1448. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1449. r3 = (r5 + r0);
  1450. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1451. result = (result + 512) >> 10;
  1452. CLIP_RESULT(result)
  1453. *(p_cur += outpitch) = result;
  1454. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1455. }
  1456. p_cur += curr_offset;
  1457. }
  1458. }
  1459. return ;
  1460. }
  1461. void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
  1462. uint8 *out, int outpitch,
  1463. int blkwidth, int blkheight)
  1464. {
  1465. int j, i;
  1466. int result;
  1467. uint8 *p_cur, *p_ref, *p_tmp8;
  1468. int curr_offset, ref_offset;
  1469. uint8 tmp_res[24][24], tmp_in[24][24];
  1470. uint32 *p_tmp;
  1471. uint32 tmp, pkres, tmp_result;
  1472. int32 r0, r1, r2, r3, r4, r5;
  1473. int32 r6, r7, r8, r9, r10, r13;
  1474. ref_offset = inpitch - blkwidth;
  1475. p_ref = in1 - 2;
  1476. /* perform horizontal interpolation */
  1477. /* not word-aligned */
  1478. /* It is faster to read 1 byte at time to avoid calling CreateAlign */
  1479. /* if(((uint32)p_ref)&0x3)
  1480. {
  1481. CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
  1482. p_ref = &tmp_in[0][0];
  1483. ref_offset = 24-blkwidth;
  1484. }*/
  1485. p_tmp = (uint32*) & (tmp_res[0][0]);
  1486. for (j = blkheight; j > 0; j--)
  1487. {
  1488. r13 = 0;
  1489. tmp = (uint32)(p_ref + blkwidth);
  1490. //r0 = *((uint32*)p_ref); /* d,c,b,a */
  1491. //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */
  1492. //r0 &= 0xFF00FF; /* 0,c,0,a */
  1493. /* It is faster to read 1 byte at a time, */
  1494. r0 = p_ref[0];
  1495. r1 = p_ref[2];
  1496. r0 |= (r1 << 16); /* 0,c,0,a */
  1497. r1 = p_ref[1];
  1498. r2 = p_ref[3];
  1499. r1 |= (r2 << 16); /* 0,d,0,b */
  1500. while ((uint32)p_ref < tmp)
  1501. {
  1502. //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
  1503. //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */
  1504. //r2 &= 0xFF00FF; /* 0,g,0,e */
  1505. /* It is faster to read 1 byte at a time, */
  1506. r2 = *(p_ref += 4);
  1507. r3 = p_ref[2];
  1508. r2 |= (r3 << 16); /* 0,g,0,e */
  1509. r3 = p_ref[1];
  1510. r4 = p_ref[3];
  1511. r3 |= (r4 << 16); /* 0,h,0,f */
  1512. r4 = r0 + r3; /* c+h, a+f */
  1513. r5 = r0 + r1; /* c+d, a+b */
  1514. r6 = r2 + r3; /* g+h, e+f */
  1515. r5 >>= 16;
  1516. r5 |= (r6 << 16); /* e+f, c+d */
  1517. r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
  1518. r4 += 0x100010; /* +16, +16 */
  1519. r5 = r1 + r2; /* d+g, b+e */
  1520. r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
  1521. r4 >>= 5;
  1522. r13 |= r4; /* check clipping */
  1523. r4 &= 0xFF00FF; /* mask */
  1524. r5 = p_ref[4]; /* i */
  1525. r6 = (r5 << 16);
  1526. r5 = r6 | (r2 >> 16);/* 0,i,0,g */
  1527. r5 += r1; /* d+i, b+g */ /* r5 not free */
  1528. r1 >>= 16;
  1529. r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
  1530. r1 += r2; /* f+g, d+e */
  1531. r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
  1532. r0 >>= 16;
  1533. r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
  1534. r0 += r3; /* e+h, c+f */
  1535. r5 += 0x100010; /* 16,16 */
  1536. r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
  1537. r5 >>= 5;
  1538. r13 |= r5; /* check clipping */
  1539. r5 &= 0xFF00FF; /* mask */
  1540. r4 |= (r5 << 8); /* pack them together */
  1541. *p_tmp++ = r4;
  1542. r1 = r3;
  1543. r0 = r2;
  1544. }
  1545. p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
  1546. p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
  1547. if (r13&0xFF000700) /* need clipping */
  1548. {
  1549. /* move back to the beginning of the line */
  1550. p_ref -= (ref_offset + blkwidth); /* input */
  1551. p_tmp -= 6; /* intermediate output */
  1552. tmp = (uint32)(p_ref + blkwidth);
  1553. while ((uint32)p_ref < tmp)
  1554. {
  1555. r0 = *p_ref++;
  1556. r1 = *p_ref++;
  1557. r2 = *p_ref++;
  1558. r3 = *p_ref++;
  1559. r4 = *p_ref++;
  1560. /* first pixel */
  1561. r5 = *p_ref++;
  1562. result = (r0 + r5);
  1563. r0 = (r1 + r4);
  1564. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1565. r0 = (r2 + r3);
  1566. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1567. result = (result + 16) >> 5;
  1568. CLIP_RESULT(result)
  1569. pkres = result;
  1570. /* second pixel */
  1571. r0 = *p_ref++;
  1572. result = (r1 + r0);
  1573. r1 = (r2 + r5);
  1574. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1575. r1 = (r3 + r4);
  1576. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1577. result = (result + 16) >> 5;
  1578. CLIP_RESULT(result)
  1579. pkres |= (result << 8);
  1580. /* third pixel */
  1581. r1 = *p_ref++;
  1582. result = (r2 + r1);
  1583. r2 = (r3 + r0);
  1584. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1585. r2 = (r4 + r5);
  1586. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1587. result = (result + 16) >> 5;
  1588. CLIP_RESULT(result)
  1589. pkres |= (result << 16);
  1590. /* fourth pixel */
  1591. r2 = *p_ref++;
  1592. result = (r3 + r2);
  1593. r3 = (r4 + r1);
  1594. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1595. r3 = (r5 + r0);
  1596. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1597. result = (result + 16) >> 5;
  1598. CLIP_RESULT(result)
  1599. pkres |= (result << 24);
  1600. *p_tmp++ = pkres; /* write 4 pixel */
  1601. p_ref -= 5;
  1602. }
  1603. p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
  1604. p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
  1605. }
  1606. }
  1607. /* perform vertical interpolation */
  1608. /* not word-aligned */
  1609. if (((uint32)in2)&0x3)
  1610. {
  1611. CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
  1612. in2 = &tmp_in[2][0];
  1613. inpitch = 24;
  1614. }
  1615. p_cur = out;
  1616. curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
  1617. pkres = blkheight * inpitch; /* reuse it for limit */
  1618. curr_offset += 3;
  1619. for (j = 0; j < blkwidth; j += 4, in2 += 4)
  1620. {
  1621. r13 = 0;
  1622. p_ref = in2;
  1623. p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
  1624. p_tmp8 -= 24; /* compensate for the first offset */
  1625. p_cur -= outpitch; /* compensate for the first offset */
  1626. tmp = (uint32)(p_ref + pkres); /* limit */
  1627. while ((uint32)p_ref < tmp) /* the loop un-rolled */
  1628. {
  1629. /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign, */
  1630. /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2];
  1631. r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3];
  1632. r6 |= (r1<<16); p_ref+=inpitch; */
  1633. r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
  1634. p_ref += inpitch;
  1635. r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
  1636. r0 &= 0xFF00FF;
  1637. /*p_ref8 = p_ref+(inpitch<<1);
  1638. r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16);
  1639. r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/
  1640. r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
  1641. r7 = (r1 >> 8) & 0xFF00FF;
  1642. r1 &= 0xFF00FF;
  1643. r0 += r1;
  1644. r6 += r7;
  1645. /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16);
  1646. r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/
  1647. r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
  1648. r8 = (r2 >> 8) & 0xFF00FF;
  1649. r2 &= 0xFF00FF;
  1650. /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2];
  1651. r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1];
  1652. r2 = p_ref8[3]; r7 |= (r2<<16);*/
  1653. r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
  1654. r7 = (r1 >> 8) & 0xFF00FF;
  1655. r1 &= 0xFF00FF;
  1656. r1 += r2;
  1657. r7 += r8;
  1658. r0 += 20 * r1;
  1659. r6 += 20 * r7;
  1660. r0 += 0x100010;
  1661. r6 += 0x100010;
  1662. /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2];
  1663. r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/
  1664. r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
  1665. r8 = (r2 >> 8) & 0xFF00FF;
  1666. r2 &= 0xFF00FF;
  1667. /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2];
  1668. r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1];
  1669. r2 = p_ref8[3]; r7 |= (r2<<16);*/
  1670. r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
  1671. r7 = (r1 >> 8) & 0xFF00FF;
  1672. r1 &= 0xFF00FF;
  1673. r1 += r2;
  1674. r7 += r8;
  1675. r0 -= 5 * r1;
  1676. r6 -= 5 * r7;
  1677. r0 >>= 5;
  1678. r6 >>= 5;
  1679. /* clip */
  1680. r13 |= r6;
  1681. r13 |= r0;
  1682. //CLIPPACK(r6,result)
  1683. /* add with horizontal results */
  1684. r10 = *((uint32*)(p_tmp8 += 24));
  1685. r9 = (r10 >> 8) & 0xFF00FF;
  1686. r10 &= 0xFF00FF;
  1687. r0 += r10;
  1688. r0 += 0x10001;
  1689. r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */
  1690. r6 += r9;
  1691. r6 += 0x10001;
  1692. r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */
  1693. r0 |= (r6 << 8); /* pack it back */
  1694. *((uint32*)(p_cur += outpitch)) = r0;
  1695. }
  1696. p_cur += curr_offset; /* offset to the next pixel */
  1697. if (r13 & 0xFF000700) /* this column need clipping */
  1698. {
  1699. p_cur -= 4;
  1700. for (i = 0; i < 4; i++)
  1701. {
  1702. p_ref = in2 + i;
  1703. p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
  1704. p_tmp8 -= 24; /* compensate for the first offset */
  1705. p_cur -= outpitch; /* compensate for the first offset */
  1706. tmp = (uint32)(p_ref + pkres); /* limit */
  1707. while ((uint32)p_ref < tmp) /* the loop un-rolled */
  1708. {
  1709. r0 = *(p_ref - (inpitch << 1));
  1710. r1 = *(p_ref - inpitch);
  1711. r2 = *p_ref;
  1712. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1713. r4 = *(p_ref += inpitch);
  1714. /* first pixel */
  1715. r5 = *(p_ref += inpitch);
  1716. result = (r0 + r5);
  1717. r0 = (r1 + r4);
  1718. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1719. r0 = (r2 + r3);
  1720. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1721. result = (result + 16) >> 5;
  1722. CLIP_RESULT(result)
  1723. tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */
  1724. result = (result + tmp_result + 1); /* no clip */
  1725. result = (result >> 1);
  1726. *(p_cur += outpitch) = result;
  1727. /* second pixel */
  1728. r0 = *(p_ref += inpitch);
  1729. result = (r1 + r0);
  1730. r1 = (r2 + r5);
  1731. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1732. r1 = (r3 + r4);
  1733. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1734. result = (result + 16) >> 5;
  1735. CLIP_RESULT(result)
  1736. tmp_result = *(p_tmp8 += 24); /* intermediate result */
  1737. result = (result + tmp_result + 1); /* no clip */
  1738. result = (result >> 1);
  1739. *(p_cur += outpitch) = result;
  1740. /* third pixel */
  1741. r1 = *(p_ref += inpitch);
  1742. result = (r2 + r1);
  1743. r2 = (r3 + r0);
  1744. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1745. r2 = (r4 + r5);
  1746. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1747. result = (result + 16) >> 5;
  1748. CLIP_RESULT(result)
  1749. tmp_result = *(p_tmp8 += 24); /* intermediate result */
  1750. result = (result + tmp_result + 1); /* no clip */
  1751. result = (result >> 1);
  1752. *(p_cur += outpitch) = result;
  1753. /* fourth pixel */
  1754. r2 = *(p_ref += inpitch);
  1755. result = (r3 + r2);
  1756. r3 = (r4 + r1);
  1757. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1758. r3 = (r5 + r0);
  1759. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1760. result = (result + 16) >> 5;
  1761. CLIP_RESULT(result)
  1762. tmp_result = *(p_tmp8 += 24); /* intermediate result */
  1763. result = (result + tmp_result + 1); /* no clip */
  1764. result = (result >> 1);
  1765. *(p_cur += outpitch) = result;
  1766. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1767. }
  1768. p_cur += (curr_offset - 3);
  1769. }
  1770. }
  1771. }
  1772. return ;
  1773. }
  1774. /* position G */
  1775. void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
  1776. int blkwidth, int blkheight)
  1777. {
  1778. int i, j;
  1779. int offset_in = inpitch - blkwidth;
  1780. int offset_out = outpitch - blkwidth;
  1781. uint32 temp;
  1782. uint8 byte;
  1783. if (((uint32)in)&3)
  1784. {
  1785. for (j = blkheight; j > 0; j--)
  1786. {
  1787. for (i = blkwidth; i > 0; i -= 4)
  1788. {
  1789. temp = *in++;
  1790. byte = *in++;
  1791. temp |= (byte << 8);
  1792. byte = *in++;
  1793. temp |= (byte << 16);
  1794. byte = *in++;
  1795. temp |= (byte << 24);
  1796. *((uint32*)out) = temp; /* write 4 bytes */
  1797. out += 4;
  1798. }
  1799. out += offset_out;
  1800. in += offset_in;
  1801. }
  1802. }
  1803. else
  1804. {
  1805. for (j = blkheight; j > 0; j--)
  1806. {
  1807. for (i = blkwidth; i > 0; i -= 4)
  1808. {
  1809. temp = *((uint32*)in);
  1810. *((uint32*)out) = temp;
  1811. in += 4;
  1812. out += 4;
  1813. }
  1814. out += offset_out;
  1815. in += offset_in;
  1816. }
  1817. }
  1818. return ;
  1819. }
  1820. void ChromaMotionComp(uint8 *ref, int picwidth, int picheight,
  1821. int x_pos, int y_pos,
  1822. uint8 *pred, int pred_pitch,
  1823. int blkwidth, int blkheight)
  1824. {
  1825. int dx, dy;
  1826. int offset_dx, offset_dy;
  1827. int index;
  1828. uint8 temp[24][24];
  1829. dx = x_pos & 7;
  1830. dy = y_pos & 7;
  1831. offset_dx = (dx + 7) >> 3;
  1832. offset_dy = (dy + 7) >> 3;
  1833. x_pos = x_pos >> 3; /* round it to full-pel resolution */
  1834. y_pos = y_pos >> 3;
  1835. if ((x_pos >= 0 && x_pos + blkwidth + offset_dx <= picwidth) && (y_pos >= 0 && y_pos + blkheight + offset_dy <= picheight))
  1836. {
  1837. ref += y_pos * picwidth + x_pos;
  1838. }
  1839. else
  1840. {
  1841. CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth + offset_dx, blkheight + offset_dy);
  1842. ref = &temp[0][0];
  1843. picwidth = 24;
  1844. }
  1845. index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
  1846. (*(ChromaMC_SIMD[index]))(ref, picwidth , dx, dy, pred, pred_pitch, blkwidth, blkheight);
  1847. return ;
  1848. }
  1849. /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
  1850. void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  1851. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  1852. {
  1853. int32 r0, r1, r2, r3, result0, result1;
  1854. uint8 temp[288];
  1855. uint8 *ref, *out;
  1856. int i, j;
  1857. int dx_8 = 8 - dx;
  1858. int dy_8 = 8 - dy;
  1859. /* horizontal first */
  1860. out = temp;
  1861. for (i = 0; i < blkheight + 1; i++)
  1862. {
  1863. ref = pRef;
  1864. r0 = ref[0];
  1865. for (j = 0; j < blkwidth; j += 4)
  1866. {
  1867. r0 |= (ref[2] << 16);
  1868. result0 = dx_8 * r0;
  1869. r1 = ref[1] | (ref[3] << 16);
  1870. result0 += dx * r1;
  1871. *(int32 *)out = result0;
  1872. result0 = dx_8 * r1;
  1873. r2 = ref[4];
  1874. r0 = r0 >> 16;
  1875. r1 = r0 | (r2 << 16);
  1876. result0 += dx * r1;
  1877. *(int32 *)(out + 16) = result0;
  1878. ref += 4;
  1879. out += 4;
  1880. r0 = r2;
  1881. }
  1882. pRef += srcPitch;
  1883. out += (32 - blkwidth);
  1884. }
  1885. // pRef -= srcPitch*(blkheight+1);
  1886. ref = temp;
  1887. for (j = 0; j < blkwidth; j += 4)
  1888. {
  1889. r0 = *(int32 *)ref;
  1890. r1 = *(int32 *)(ref + 16);
  1891. ref += 32;
  1892. out = pOut;
  1893. for (i = 0; i < (blkheight >> 1); i++)
  1894. {
  1895. result0 = dy_8 * r0 + 0x00200020;
  1896. r2 = *(int32 *)ref;
  1897. result0 += dy * r2;
  1898. result0 >>= 6;
  1899. result0 &= 0x00FF00FF;
  1900. r0 = r2;
  1901. result1 = dy_8 * r1 + 0x00200020;
  1902. r3 = *(int32 *)(ref + 16);
  1903. result1 += dy * r3;
  1904. result1 >>= 6;
  1905. result1 &= 0x00FF00FF;
  1906. r1 = r3;
  1907. *(int32 *)out = result0 | (result1 << 8);
  1908. out += predPitch;
  1909. ref += 32;
  1910. result0 = dy_8 * r0 + 0x00200020;
  1911. r2 = *(int32 *)ref;
  1912. result0 += dy * r2;
  1913. result0 >>= 6;
  1914. result0 &= 0x00FF00FF;
  1915. r0 = r2;
  1916. result1 = dy_8 * r1 + 0x00200020;
  1917. r3 = *(int32 *)(ref + 16);
  1918. result1 += dy * r3;
  1919. result1 >>= 6;
  1920. result1 &= 0x00FF00FF;
  1921. r1 = r3;
  1922. *(int32 *)out = result0 | (result1 << 8);
  1923. out += predPitch;
  1924. ref += 32;
  1925. }
  1926. pOut += 4;
  1927. ref = temp + 4; /* since it can only iterate twice max */
  1928. }
  1929. return;
  1930. }
  1931. void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  1932. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  1933. {
  1934. OSCL_UNUSED_ARG(dy);
  1935. int32 r0, r1, r2, result0, result1;
  1936. uint8 *ref, *out;
  1937. int i, j;
  1938. int dx_8 = 8 - dx;
  1939. /* horizontal first */
  1940. for (i = 0; i < blkheight; i++)
  1941. {
  1942. ref = pRef;
  1943. out = pOut;
  1944. r0 = ref[0];
  1945. for (j = 0; j < blkwidth; j += 4)
  1946. {
  1947. r0 |= (ref[2] << 16);
  1948. result0 = dx_8 * r0 + 0x00040004;
  1949. r1 = ref[1] | (ref[3] << 16);
  1950. result0 += dx * r1;
  1951. result0 >>= 3;
  1952. result0 &= 0x00FF00FF;
  1953. result1 = dx_8 * r1 + 0x00040004;
  1954. r2 = ref[4];
  1955. r0 = r0 >> 16;
  1956. r1 = r0 | (r2 << 16);
  1957. result1 += dx * r1;
  1958. result1 >>= 3;
  1959. result1 &= 0x00FF00FF;
  1960. *(int32 *)out = result0 | (result1 << 8);
  1961. ref += 4;
  1962. out += 4;
  1963. r0 = r2;
  1964. }
  1965. pRef += srcPitch;
  1966. pOut += predPitch;
  1967. }
  1968. return;
  1969. }
  1970. void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  1971. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  1972. {
  1973. OSCL_UNUSED_ARG(dx);
  1974. int32 r0, r1, r2, r3, result0, result1;
  1975. int i, j;
  1976. uint8 *ref, *out;
  1977. int dy_8 = 8 - dy;
  1978. /* vertical first */
  1979. for (i = 0; i < blkwidth; i += 4)
  1980. {
  1981. ref = pRef;
  1982. out = pOut;
  1983. r0 = ref[0] | (ref[2] << 16);
  1984. r1 = ref[1] | (ref[3] << 16);
  1985. ref += srcPitch;
  1986. for (j = 0; j < blkheight; j++)
  1987. {
  1988. result0 = dy_8 * r0 + 0x00040004;
  1989. r2 = ref[0] | (ref[2] << 16);
  1990. result0 += dy * r2;
  1991. result0 >>= 3;
  1992. result0 &= 0x00FF00FF;
  1993. r0 = r2;
  1994. result1 = dy_8 * r1 + 0x00040004;
  1995. r3 = ref[1] | (ref[3] << 16);
  1996. result1 += dy * r3;
  1997. result1 >>= 3;
  1998. result1 &= 0x00FF00FF;
  1999. r1 = r3;
  2000. *(int32 *)out = result0 | (result1 << 8);
  2001. ref += srcPitch;
  2002. out += predPitch;
  2003. }
  2004. pOut += 4;
  2005. pRef += 4;
  2006. }
  2007. return;
  2008. }
  2009. void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  2010. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  2011. {
  2012. OSCL_UNUSED_ARG(blkwidth);
  2013. int32 r0, r1, temp0, temp1, result;
  2014. int32 temp[9];
  2015. int32 *out;
  2016. int i, r_temp;
  2017. int dy_8 = 8 - dy;
  2018. /* horizontal first */
  2019. out = temp;
  2020. for (i = 0; i < blkheight + 1; i++)
  2021. {
  2022. r_temp = pRef[1];
  2023. temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
  2024. temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
  2025. r0 = temp0 | (temp1 << 16);
  2026. *out++ = r0;
  2027. pRef += srcPitch;
  2028. }
  2029. pRef -= srcPitch * (blkheight + 1);
  2030. out = temp;
  2031. r0 = *out++;
  2032. for (i = 0; i < blkheight; i++)
  2033. {
  2034. result = dy_8 * r0 + 0x00200020;
  2035. r1 = *out++;
  2036. result += dy * r1;
  2037. result >>= 6;
  2038. result &= 0x00FF00FF;
  2039. *(int16 *)pOut = (result >> 8) | (result & 0xFF);
  2040. r0 = r1;
  2041. pOut += predPitch;
  2042. }
  2043. return;
  2044. }
  2045. void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  2046. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  2047. {
  2048. OSCL_UNUSED_ARG(dy);
  2049. OSCL_UNUSED_ARG(blkwidth);
  2050. int i, temp, temp0, temp1;
  2051. /* horizontal first */
  2052. for (i = 0; i < blkheight; i++)
  2053. {
  2054. temp = pRef[1];
  2055. temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
  2056. temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
  2057. *(int16 *)pOut = temp0 | (temp1 << 8);
  2058. pRef += srcPitch;
  2059. pOut += predPitch;
  2060. }
  2061. return;
  2062. }
  2063. void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  2064. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  2065. {
  2066. OSCL_UNUSED_ARG(dx);
  2067. OSCL_UNUSED_ARG(blkwidth);
  2068. int32 r0, r1, result;
  2069. int i;
  2070. int dy_8 = 8 - dy;
  2071. r0 = pRef[0] | (pRef[1] << 16);
  2072. pRef += srcPitch;
  2073. for (i = 0; i < blkheight; i++)
  2074. {
  2075. result = dy_8 * r0 + 0x00040004;
  2076. r1 = pRef[0] | (pRef[1] << 16);
  2077. result += dy * r1;
  2078. result >>= 3;
  2079. result &= 0x00FF00FF;
  2080. *(int16 *)pOut = (result >> 8) | (result & 0xFF);
  2081. r0 = r1;
  2082. pRef += srcPitch;
  2083. pOut += predPitch;
  2084. }
  2085. return;
  2086. }
  2087. void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
  2088. uint8 *pOut, int predPitch, int blkwidth, int blkheight)
  2089. {
  2090. OSCL_UNUSED_ARG(dx);
  2091. OSCL_UNUSED_ARG(dy);
  2092. int i, j;
  2093. int offset_in = srcPitch - blkwidth;
  2094. int offset_out = predPitch - blkwidth;
  2095. uint16 temp;
  2096. uint8 byte;
  2097. if (((uint32)pRef)&1)
  2098. {
  2099. for (j = blkheight; j > 0; j--)
  2100. {
  2101. for (i = blkwidth; i > 0; i -= 2)
  2102. {
  2103. temp = *pRef++;
  2104. byte = *pRef++;
  2105. temp |= (byte << 8);
  2106. *((uint16*)pOut) = temp; /* write 2 bytes */
  2107. pOut += 2;
  2108. }
  2109. pOut += offset_out;
  2110. pRef += offset_in;
  2111. }
  2112. }
  2113. else
  2114. {
  2115. for (j = blkheight; j > 0; j--)
  2116. {
  2117. for (i = blkwidth; i > 0; i -= 2)
  2118. {
  2119. temp = *((uint16*)pRef);
  2120. *((uint16*)pOut) = temp;
  2121. pRef += 2;
  2122. pOut += 2;
  2123. }
  2124. pOut += offset_out;
  2125. pRef += offset_in;
  2126. }
  2127. }
  2128. return ;
  2129. }