PageRenderTime 168ms CodeModel.GetById 0ms RepoModel.GetById 0ms app.codeStats 0ms

/Show/avc/pred_inter.cpp

http://github.com/mbebenita/Broadway
C++ | 2330 lines | 1974 code | 201 blank | 155 comment | 158 complexity | 5750c7959b2cde35dcd102bfcf79ae88 MD5 | raw file
Possible License(s): BSD-3-Clause

Large files are truncated, but you can click here to view the full file

  1. /* ------------------------------------------------------------------
  2. * Copyright (C) 1998-2009 PacketVideo
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  13. * express or implied.
  14. * See the License for the specific language governing permissions
  15. * and limitations under the License.
  16. * -------------------------------------------------------------------
  17. */
#include <stdint.h>
#include "avcdec_lib.h"
  19. #define CLIP_RESULT(x) if((uint)x > 0xFF){ \
  20. x = 0xFF & (~(x>>31));}
  21. /* (blkwidth << 2) + (dy << 1) + dx */
  22. static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
  23. {
  24. &ChromaFullMC_SIMD,
  25. &ChromaHorizontalMC_SIMD,
  26. &ChromaVerticalMC_SIMD,
  27. &ChromaDiagonalMC_SIMD,
  28. &ChromaFullMC_SIMD,
  29. &ChromaHorizontalMC2_SIMD,
  30. &ChromaVerticalMC2_SIMD,
  31. &ChromaDiagonalMC2_SIMD
  32. };
  33. /* Perform motion prediction and compensation with residue if exist. */
  34. void InterMBPrediction(AVCCommonObj *video)
  35. {
  36. AVCMacroblock *currMB = video->currMB;
  37. AVCPictureData *currPic = video->currPic;
  38. int mbPartIdx, subMbPartIdx;
  39. int ref_idx;
  40. int offset_MbPart_indx = 0;
  41. int16 *mv;
  42. uint32 x_pos, y_pos;
  43. uint8 *curL, *curCb, *curCr;
  44. uint8 *ref_l, *ref_Cb, *ref_Cr;
  45. uint8 *predBlock, *predCb, *predCr;
  46. int block_x, block_y, offset_x, offset_y, offsetP, offset;
  47. int x_position = (video->mb_x << 4);
  48. int y_position = (video->mb_y << 4);
  49. int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
  50. int picWidth = currPic->pitch;
  51. int picHeight = currPic->height;
  52. int16 *dataBlock;
  53. uint32 cbp4x4;
  54. uint32 tmp_word;
  55. tmp_word = y_position * picWidth;
  56. curL = currPic->Sl + tmp_word + x_position;
  57. offset = (tmp_word >> 2) + (x_position >> 1);
  58. curCb = currPic->Scb + offset;
  59. curCr = currPic->Scr + offset;
  60. #ifdef USE_PRED_BLOCK
  61. predBlock = video->pred + 84;
  62. predCb = video->pred + 452;
  63. predCr = video->pred + 596;
  64. #else
  65. predBlock = curL;
  66. predCb = curCb;
  67. predCr = curCr;
  68. #endif
  69. GetMotionVectorPredictor(video, false);
  70. for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
  71. {
  72. MbHeight = currMB->SubMbPartHeight[mbPartIdx];
  73. MbWidth = currMB->SubMbPartWidth[mbPartIdx];
  74. mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
  75. mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
  76. ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
  77. offset_indx = 0;
  78. ref_l = video->RefPicList0[ref_idx]->Sl;
  79. ref_Cb = video->RefPicList0[ref_idx]->Scb;
  80. ref_Cr = video->RefPicList0[ref_idx]->Scr;
  81. for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
  82. {
  83. block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); // check this
  84. block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
  85. mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
  86. offset_x = x_position + (block_x << 2);
  87. offset_y = y_position + (block_y << 2);
  88. x_pos = (offset_x << 2) + *mv++; /*quarter pel */
  89. y_pos = (offset_y << 2) + *mv; /*quarter pel */
  90. //offset = offset_y * currPic->width;
  91. //offsetC = (offset >> 2) + (offset_x >> 1);
  92. #ifdef USE_PRED_BLOCK
  93. offsetP = (block_y * 80) + (block_x << 2);
  94. LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
  95. /*comp_Sl + offset + offset_x,*/
  96. predBlock + offsetP, 20, MbWidth, MbHeight);
  97. #else
  98. offsetP = (block_y << 2) * picWidth + (block_x << 2);
  99. LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
  100. /*comp_Sl + offset + offset_x,*/
  101. predBlock + offsetP, picWidth, MbWidth, MbHeight);
  102. #endif
  103. #ifdef USE_PRED_BLOCK
  104. offsetP = (block_y * 24) + (block_x << 1);
  105. ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  106. /*comp_Scb + offsetC,*/
  107. predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
  108. ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  109. /*comp_Scr + offsetC,*/
  110. predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
  111. #else
  112. offsetP = (block_y * picWidth) + (block_x << 1);
  113. ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  114. /*comp_Scb + offsetC,*/
  115. predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
  116. ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
  117. /*comp_Scr + offsetC,*/
  118. predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
  119. #endif
  120. offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
  121. }
  122. offset_MbPart_indx = currMB->MbPartWidth >> 4;
  123. }
  124. /* used in decoder, used to be if(!encFlag) */
  125. /* transform in raster scan order */
  126. dataBlock = video->block;
  127. cbp4x4 = video->cbp4x4;
  128. /* luma */
  129. for (block_y = 4; block_y > 0; block_y--)
  130. {
  131. for (block_x = 4; block_x > 0; block_x--)
  132. {
  133. #ifdef USE_PRED_BLOCK
  134. if (cbp4x4&1)
  135. {
  136. itrans(dataBlock, predBlock, predBlock, 20);
  137. }
  138. #else
  139. if (cbp4x4&1)
  140. {
  141. itrans(dataBlock, curL, curL, picWidth);
  142. }
  143. #endif
  144. cbp4x4 >>= 1;
  145. dataBlock += 4;
  146. #ifdef USE_PRED_BLOCK
  147. predBlock += 4;
  148. #else
  149. curL += 4;
  150. #endif
  151. }
  152. dataBlock += 48;
  153. #ifdef USE_PRED_BLOCK
  154. predBlock += 64;
  155. #else
  156. curL += ((picWidth << 2) - 16);
  157. #endif
  158. }
  159. /* chroma */
  160. picWidth = (picWidth >> 1);
  161. for (block_y = 2; block_y > 0; block_y--)
  162. {
  163. for (block_x = 2; block_x > 0; block_x--)
  164. {
  165. #ifdef USE_PRED_BLOCK
  166. if (cbp4x4&1)
  167. {
  168. ictrans(dataBlock, predCb, predCb, 12);
  169. }
  170. #else
  171. if (cbp4x4&1)
  172. {
  173. ictrans(dataBlock, curCb, curCb, picWidth);
  174. }
  175. #endif
  176. cbp4x4 >>= 1;
  177. dataBlock += 4;
  178. #ifdef USE_PRED_BLOCK
  179. predCb += 4;
  180. #else
  181. curCb += 4;
  182. #endif
  183. }
  184. for (block_x = 2; block_x > 0; block_x--)
  185. {
  186. #ifdef USE_PRED_BLOCK
  187. if (cbp4x4&1)
  188. {
  189. ictrans(dataBlock, predCr, predCr, 12);
  190. }
  191. #else
  192. if (cbp4x4&1)
  193. {
  194. ictrans(dataBlock, curCr, curCr, picWidth);
  195. }
  196. #endif
  197. cbp4x4 >>= 1;
  198. dataBlock += 4;
  199. #ifdef USE_PRED_BLOCK
  200. predCr += 4;
  201. #else
  202. curCr += 4;
  203. #endif
  204. }
  205. dataBlock += 48;
  206. #ifdef USE_PRED_BLOCK
  207. predCb += 40;
  208. predCr += 40;
  209. #else
  210. curCb += ((picWidth << 2) - 8);
  211. curCr += ((picWidth << 2) - 8);
  212. #endif
  213. }
  214. #ifdef MB_BASED_DEBLOCK
  215. SaveNeighborForIntraPred(video, offset);
  216. #endif
  217. return ;
  218. }
  219. /* preform the actual motion comp here */
  220. void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
  221. int x_pos, int y_pos,
  222. uint8 *pred, int pred_pitch,
  223. int blkwidth, int blkheight)
  224. {
  225. int dx, dy;
  226. uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
  227. int temp2[21][21]; /* for intermediate results */
  228. uint8 *ref2;
  229. dx = x_pos & 3;
  230. dy = y_pos & 3;
  231. x_pos = x_pos >> 2; /* round it to full-pel resolution */
  232. y_pos = y_pos >> 2;
  233. /* perform actual motion compensation */
  234. if (dx == 0 && dy == 0)
  235. { /* fullpel position *//* G */
  236. if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
  237. {
  238. ref += y_pos * picwidth + x_pos;
  239. FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
  240. }
  241. else
  242. {
  243. CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
  244. FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
  245. }
  246. } /* other positions */
  247. else if (dy == 0)
  248. { /* no vertical interpolation *//* a,b,c*/
  249. if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
  250. {
  251. ref += y_pos * picwidth + x_pos;
  252. HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
  253. }
  254. else /* need padding */
  255. {
  256. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
  257. HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
  258. }
  259. }
  260. else if (dx == 0)
  261. { /*no horizontal interpolation *//* d,h,n */
  262. if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
  263. {
  264. ref += y_pos * picwidth + x_pos;
  265. VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
  266. }
  267. else /* need padding */
  268. {
  269. CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
  270. VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
  271. }
  272. }
  273. else if (dy == 2)
  274. { /* horizontal cross *//* i, j, k */
  275. if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
  276. {
  277. ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
  278. VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
  279. HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
  280. }
  281. else /* need padding */
  282. {
  283. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
  284. VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
  285. HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
  286. }
  287. }
  288. else if (dx == 2)
  289. { /* vertical cross */ /* f,q */
  290. if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
  291. {
  292. ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
  293. HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
  294. VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
  295. }
  296. else /* need padding */
  297. {
  298. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
  299. HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
  300. VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
  301. }
  302. }
  303. else
  304. { /* diagonal *//* e,g,p,r */
  305. if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
  306. y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
  307. {
  308. ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
  309. ref += (y_pos * picwidth) + x_pos + (dx / 2);
  310. DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
  311. }
  312. else /* need padding */
  313. {
  314. CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
  315. ref2 = &temp[2 + (dy/2)][2];
  316. ref = &temp[2][2 + (dx/2)];
  317. DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
  318. }
  319. }
  320. return ;
  321. }
  322. void CreateAlign(uint8 *ref, int picwidth, int y_pos,
  323. uint8 *out, int blkwidth, int blkheight)
  324. {
  325. int i, j;
  326. int offset, out_offset;
  327. uint32 prev_pix, result, pix1, pix2, pix4;
  328. out_offset = 24 - blkwidth;
  329. //switch(x_pos&0x3){
  330. switch (((uint32)ref)&0x3)
  331. {
  332. case 1:
  333. ref += y_pos * picwidth;
  334. offset = picwidth - blkwidth - 3;
  335. for (j = 0; j < blkheight; j++)
  336. {
  337. pix1 = *ref++;
  338. pix2 = *((uint16*)ref);
  339. ref += 2;
  340. result = (pix2 << 8) | pix1;
  341. for (i = 3; i < blkwidth; i += 4)
  342. {
  343. pix4 = *((uint32*)ref);
  344. ref += 4;
  345. prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
  346. result |= prev_pix;
  347. *((uint32*)out) = result; /* write 4 bytes */
  348. out += 4;
  349. result = pix4 >> 8; /* for the next loop */
  350. }
  351. ref += offset;
  352. out += out_offset;
  353. }
  354. break;
  355. case 2:
  356. ref += y_pos * picwidth;
  357. offset = picwidth - blkwidth - 2;
  358. for (j = 0; j < blkheight; j++)
  359. {
  360. result = *((uint16*)ref);
  361. ref += 2;
  362. for (i = 2; i < blkwidth; i += 4)
  363. {
  364. pix4 = *((uint32*)ref);
  365. ref += 4;
  366. prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
  367. result |= prev_pix;
  368. *((uint32*)out) = result; /* write 4 bytes */
  369. out += 4;
  370. result = pix4 >> 16; /* for the next loop */
  371. }
  372. ref += offset;
  373. out += out_offset;
  374. }
  375. break;
  376. case 3:
  377. ref += y_pos * picwidth;
  378. offset = picwidth - blkwidth - 1;
  379. for (j = 0; j < blkheight; j++)
  380. {
  381. result = *ref++;
  382. for (i = 1; i < blkwidth; i += 4)
  383. {
  384. pix4 = *((uint32*)ref);
  385. ref += 4;
  386. prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
  387. result |= prev_pix;
  388. *((uint32*)out) = result; /* write 4 bytes */
  389. out += 4;
  390. result = pix4 >> 24; /* for the next loop */
  391. }
  392. ref += offset;
  393. out += out_offset;
  394. }
  395. break;
  396. }
  397. }
  398. void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
  399. uint8 *out, int blkwidth, int blkheight)
  400. {
  401. int x_inc0, x_mid;
  402. int y_inc, y_inc0, y_inc1, y_mid;
  403. int i, j;
  404. int offset;
  405. if (x_pos < 0)
  406. {
  407. x_inc0 = 0; /* increment for the first part */
  408. x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth); /* stopping point */
  409. x_pos = 0;
  410. }
  411. else if (x_pos + blkwidth > picwidth)
  412. {
  413. x_inc0 = 1; /* increasing */
  414. x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0); /* clip negative to zero, encode fool proof! */
  415. }
  416. else /* normal case */
  417. {
  418. x_inc0 = 1;
  419. x_mid = blkwidth; /* just one run */
  420. }
  421. /* boundary for y_pos, taking the result from x_pos into account */
  422. if (y_pos < 0)
  423. {
  424. y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
  425. y_inc1 = picwidth + y_inc0;
  426. y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
  427. y_pos = 0;
  428. }
  429. else if (y_pos + blkheight > picheight)
  430. {
  431. y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
  432. y_inc0 = picwidth + y_inc1; /* increasing */
  433. y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
  434. }
  435. else /* normal case */
  436. {
  437. y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
  438. y_inc0 = picwidth + y_inc1;
  439. y_mid = blkheight;
  440. }
  441. /* clip y_pos and x_pos */
  442. if (y_pos > picheight - 1) y_pos = picheight - 1;
  443. if (x_pos > picwidth - 1) x_pos = picwidth - 1;
  444. ref += y_pos * picwidth + x_pos;
  445. y_inc = y_inc0; /* start with top half */
  446. offset = 24 - blkwidth; /* to use in offset out */
  447. blkwidth -= x_mid; /* to use in the loop limit */
  448. if (x_inc0 == 0)
  449. {
  450. for (j = 0; j < blkheight; j++)
  451. {
  452. if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */
  453. {
  454. y_inc = y_inc1; /* switch to lower half */
  455. }
  456. for (i = x_mid; i > 0; i--) /* first or third quarter */
  457. {
  458. *out++ = *ref;
  459. }
  460. for (i = blkwidth; i > 0; i--) /* second or fourth quarter */
  461. {
  462. *out++ = *ref++;
  463. }
  464. out += offset;
  465. ref += y_inc;
  466. }
  467. }
  468. else
  469. {
  470. for (j = 0; j < blkheight; j++)
  471. {
  472. if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */
  473. {
  474. y_inc = y_inc1; /* switch to lower half */
  475. }
  476. for (i = x_mid; i > 0; i--) /* first or third quarter */
  477. {
  478. *out++ = *ref++;
  479. }
  480. for (i = blkwidth; i > 0; i--) /* second or fourth quarter */
  481. {
  482. *out++ = *ref;
  483. }
  484. out += offset;
  485. ref += y_inc;
  486. }
  487. }
  488. return ;
  489. }
  490. void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
  491. int blkwidth, int blkheight, int dx)
  492. {
  493. uint8 *p_ref;
  494. uint32 *p_cur;
  495. uint32 tmp;
  496. uint32 pkres;
  497. int result, curr_offset, ref_offset;
  498. int j;
  499. int32 r0, r1, r2, r3, r4, r5;
  500. int32 r13, r6;
  501. p_cur = (uint32*)out; /* assume it's word aligned */
  502. curr_offset = (outpitch - blkwidth) >> 2;
  503. p_ref = in;
  504. ref_offset = inpitch - blkwidth;
  505. if (dx&1)
  506. {
  507. dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
  508. p_ref -= 2;
  509. r13 = 0;
  510. for (j = blkheight; j > 0; j--)
  511. {
  512. tmp = (uint32)(p_ref + blkwidth);
  513. r0 = p_ref[0];
  514. r1 = p_ref[2];
  515. r0 |= (r1 << 16); /* 0,c,0,a */
  516. r1 = p_ref[1];
  517. r2 = p_ref[3];
  518. r1 |= (r2 << 16); /* 0,d,0,b */
  519. while ((uint32)p_ref < tmp)
  520. {
  521. r2 = *(p_ref += 4); /* move pointer to e */
  522. r3 = p_ref[2];
  523. r2 |= (r3 << 16); /* 0,g,0,e */
  524. r3 = p_ref[1];
  525. r4 = p_ref[3];
  526. r3 |= (r4 << 16); /* 0,h,0,f */
  527. r4 = r0 + r3; /* c+h, a+f */
  528. r5 = r0 + r1; /* c+d, a+b */
  529. r6 = r2 + r3; /* g+h, e+f */
  530. r5 >>= 16;
  531. r5 |= (r6 << 16); /* e+f, c+d */
  532. r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
  533. r4 += 0x100010; /* +16, +16 */
  534. r5 = r1 + r2; /* d+g, b+e */
  535. r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
  536. r4 >>= 5;
  537. r13 |= r4; /* check clipping */
  538. r5 = p_ref[dx+2];
  539. r6 = p_ref[dx+4];
  540. r5 |= (r6 << 16);
  541. r4 += r5;
  542. r4 += 0x10001;
  543. r4 = (r4 >> 1) & 0xFF00FF;
  544. r5 = p_ref[4]; /* i */
  545. r6 = (r5 << 16);
  546. r5 = r6 | (r2 >> 16);/* 0,i,0,g */
  547. r5 += r1; /* d+i, b+g */ /* r5 not free */
  548. r1 >>= 16;
  549. r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
  550. r1 += r2; /* f+g, d+e */
  551. r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
  552. r0 >>= 16;
  553. r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
  554. r0 += r3; /* e+h, c+f */
  555. r5 += 0x100010; /* 16,16 */
  556. r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
  557. r5 >>= 5;
  558. r13 |= r5; /* check clipping */
  559. r0 = p_ref[dx+3];
  560. r1 = p_ref[dx+5];
  561. r0 |= (r1 << 16);
  562. r5 += r0;
  563. r5 += 0x10001;
  564. r5 = (r5 >> 1) & 0xFF00FF;
  565. r4 |= (r5 << 8); /* pack them together */
  566. *p_cur++ = r4;
  567. r1 = r3;
  568. r0 = r2;
  569. }
  570. p_cur += curr_offset; /* move to the next line */
  571. p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
  572. if (r13&0xFF000700) /* need clipping */
  573. {
  574. /* move back to the beginning of the line */
  575. p_ref -= (ref_offset + blkwidth); /* input */
  576. p_cur -= (outpitch >> 2);
  577. tmp = (uint32)(p_ref + blkwidth);
  578. for (; (uint32)p_ref < tmp;)
  579. {
  580. r0 = *p_ref++;
  581. r1 = *p_ref++;
  582. r2 = *p_ref++;
  583. r3 = *p_ref++;
  584. r4 = *p_ref++;
  585. /* first pixel */
  586. r5 = *p_ref++;
  587. result = (r0 + r5);
  588. r0 = (r1 + r4);
  589. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  590. r0 = (r2 + r3);
  591. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  592. result = (result + 16) >> 5;
  593. CLIP_RESULT(result)
  594. /* 3/4 pel, no need to clip */
  595. result = (result + p_ref[dx] + 1);
  596. pkres = (result >> 1) ;
  597. /* second pixel */
  598. r0 = *p_ref++;
  599. result = (r1 + r0);
  600. r1 = (r2 + r5);
  601. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  602. r1 = (r3 + r4);
  603. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  604. result = (result + 16) >> 5;
  605. CLIP_RESULT(result)
  606. /* 3/4 pel, no need to clip */
  607. result = (result + p_ref[dx] + 1);
  608. result = (result >> 1);
  609. pkres |= (result << 8);
  610. /* third pixel */
  611. r1 = *p_ref++;
  612. result = (r2 + r1);
  613. r2 = (r3 + r0);
  614. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  615. r2 = (r4 + r5);
  616. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  617. result = (result + 16) >> 5;
  618. CLIP_RESULT(result)
  619. /* 3/4 pel, no need to clip */
  620. result = (result + p_ref[dx] + 1);
  621. result = (result >> 1);
  622. pkres |= (result << 16);
  623. /* fourth pixel */
  624. r2 = *p_ref++;
  625. result = (r3 + r2);
  626. r3 = (r4 + r1);
  627. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  628. r3 = (r5 + r0);
  629. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  630. result = (result + 16) >> 5;
  631. CLIP_RESULT(result)
  632. /* 3/4 pel, no need to clip */
  633. result = (result + p_ref[dx] + 1);
  634. result = (result >> 1);
  635. pkres |= (result << 24);
  636. *p_cur++ = pkres; /* write 4 pixels */
  637. p_ref -= 5; /* offset back to the middle of filter */
  638. }
  639. p_cur += curr_offset; /* move to the next line */
  640. p_ref += ref_offset; /* move to the next line */
  641. }
  642. }
  643. }
  644. else
  645. {
  646. p_ref -= 2;
  647. r13 = 0;
  648. for (j = blkheight; j > 0; j--)
  649. {
  650. tmp = (uint32)(p_ref + blkwidth);
  651. r0 = p_ref[0];
  652. r1 = p_ref[2];
  653. r0 |= (r1 << 16); /* 0,c,0,a */
  654. r1 = p_ref[1];
  655. r2 = p_ref[3];
  656. r1 |= (r2 << 16); /* 0,d,0,b */
  657. while ((uint32)p_ref < tmp)
  658. {
  659. r2 = *(p_ref += 4); /* move pointer to e */
  660. r3 = p_ref[2];
  661. r2 |= (r3 << 16); /* 0,g,0,e */
  662. r3 = p_ref[1];
  663. r4 = p_ref[3];
  664. r3 |= (r4 << 16); /* 0,h,0,f */
  665. r4 = r0 + r3; /* c+h, a+f */
  666. r5 = r0 + r1; /* c+d, a+b */
  667. r6 = r2 + r3; /* g+h, e+f */
  668. r5 >>= 16;
  669. r5 |= (r6 << 16); /* e+f, c+d */
  670. r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */
  671. r4 += 0x100010; /* +16, +16 */
  672. r5 = r1 + r2; /* d+g, b+e */
  673. r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
  674. r4 >>= 5;
  675. r13 |= r4; /* check clipping */
  676. r4 &= 0xFF00FF; /* mask */
  677. r5 = p_ref[4]; /* i */
  678. r6 = (r5 << 16);
  679. r5 = r6 | (r2 >> 16);/* 0,i,0,g */
  680. r5 += r1; /* d+i, b+g */ /* r5 not free */
  681. r1 >>= 16;
  682. r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
  683. r1 += r2; /* f+g, d+e */
  684. r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */
  685. r0 >>= 16;
  686. r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
  687. r0 += r3; /* e+h, c+f */
  688. r5 += 0x100010; /* 16,16 */
  689. r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
  690. r5 >>= 5;
  691. r13 |= r5; /* check clipping */
  692. r5 &= 0xFF00FF; /* mask */
  693. r4 |= (r5 << 8); /* pack them together */
  694. *p_cur++ = r4;
  695. r1 = r3;
  696. r0 = r2;
  697. }
  698. p_cur += curr_offset; /* move to the next line */
  699. p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */
  700. if (r13&0xFF000700) /* need clipping */
  701. {
  702. /* move back to the beginning of the line */
  703. p_ref -= (ref_offset + blkwidth); /* input */
  704. p_cur -= (outpitch >> 2);
  705. tmp = (uint32)(p_ref + blkwidth);
  706. for (; (uint32)p_ref < tmp;)
  707. {
  708. r0 = *p_ref++;
  709. r1 = *p_ref++;
  710. r2 = *p_ref++;
  711. r3 = *p_ref++;
  712. r4 = *p_ref++;
  713. /* first pixel */
  714. r5 = *p_ref++;
  715. result = (r0 + r5);
  716. r0 = (r1 + r4);
  717. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  718. r0 = (r2 + r3);
  719. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  720. result = (result + 16) >> 5;
  721. CLIP_RESULT(result)
  722. pkres = result;
  723. /* second pixel */
  724. r0 = *p_ref++;
  725. result = (r1 + r0);
  726. r1 = (r2 + r5);
  727. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  728. r1 = (r3 + r4);
  729. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  730. result = (result + 16) >> 5;
  731. CLIP_RESULT(result)
  732. pkres |= (result << 8);
  733. /* third pixel */
  734. r1 = *p_ref++;
  735. result = (r2 + r1);
  736. r2 = (r3 + r0);
  737. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  738. r2 = (r4 + r5);
  739. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  740. result = (result + 16) >> 5;
  741. CLIP_RESULT(result)
  742. pkres |= (result << 16);
  743. /* fourth pixel */
  744. r2 = *p_ref++;
  745. result = (r3 + r2);
  746. r3 = (r4 + r1);
  747. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  748. r3 = (r5 + r0);
  749. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  750. result = (result + 16) >> 5;
  751. CLIP_RESULT(result)
  752. pkres |= (result << 24);
  753. *p_cur++ = pkres; /* write 4 pixels */
  754. p_ref -= 5;
  755. }
  756. p_cur += curr_offset; /* move to the next line */
  757. p_ref += ref_offset;
  758. }
  759. }
  760. }
  761. return ;
  762. }
  763. void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
  764. int blkwidth, int blkheight, int dx)
  765. {
  766. int *p_ref;
  767. uint32 *p_cur;
  768. uint32 tmp, pkres;
  769. int result, result2, curr_offset, ref_offset;
  770. int j, r0, r1, r2, r3, r4, r5;
  771. p_cur = (uint32*)out; /* assume it's word aligned */
  772. curr_offset = (outpitch - blkwidth) >> 2;
  773. p_ref = in;
  774. ref_offset = inpitch - blkwidth;
  775. if (dx&1)
  776. {
  777. dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
  778. for (j = blkheight; j > 0 ; j--)
  779. {
  780. tmp = (uint32)(p_ref + blkwidth);
  781. for (; (uint32)p_ref < tmp;)
  782. {
  783. r0 = p_ref[-2];
  784. r1 = p_ref[-1];
  785. r2 = *p_ref++;
  786. r3 = *p_ref++;
  787. r4 = *p_ref++;
  788. /* first pixel */
  789. r5 = *p_ref++;
  790. result = (r0 + r5);
  791. r0 = (r1 + r4);
  792. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  793. r0 = (r2 + r3);
  794. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  795. result = (result + 512) >> 10;
  796. CLIP_RESULT(result)
  797. result2 = ((p_ref[dx] + 16) >> 5);
  798. CLIP_RESULT(result2)
  799. /* 3/4 pel, no need to clip */
  800. result = (result + result2 + 1);
  801. pkres = (result >> 1);
  802. /* second pixel */
  803. r0 = *p_ref++;
  804. result = (r1 + r0);
  805. r1 = (r2 + r5);
  806. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  807. r1 = (r3 + r4);
  808. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  809. result = (result + 512) >> 10;
  810. CLIP_RESULT(result)
  811. result2 = ((p_ref[dx] + 16) >> 5);
  812. CLIP_RESULT(result2)
  813. /* 3/4 pel, no need to clip */
  814. result = (result + result2 + 1);
  815. result = (result >> 1);
  816. pkres |= (result << 8);
  817. /* third pixel */
  818. r1 = *p_ref++;
  819. result = (r2 + r1);
  820. r2 = (r3 + r0);
  821. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  822. r2 = (r4 + r5);
  823. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  824. result = (result + 512) >> 10;
  825. CLIP_RESULT(result)
  826. result2 = ((p_ref[dx] + 16) >> 5);
  827. CLIP_RESULT(result2)
  828. /* 3/4 pel, no need to clip */
  829. result = (result + result2 + 1);
  830. result = (result >> 1);
  831. pkres |= (result << 16);
  832. /* fourth pixel */
  833. r2 = *p_ref++;
  834. result = (r3 + r2);
  835. r3 = (r4 + r1);
  836. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  837. r3 = (r5 + r0);
  838. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  839. result = (result + 512) >> 10;
  840. CLIP_RESULT(result)
  841. result2 = ((p_ref[dx] + 16) >> 5);
  842. CLIP_RESULT(result2)
  843. /* 3/4 pel, no need to clip */
  844. result = (result + result2 + 1);
  845. result = (result >> 1);
  846. pkres |= (result << 24);
  847. *p_cur++ = pkres; /* write 4 pixels */
  848. p_ref -= 3; /* offset back to the middle of filter */
  849. }
  850. p_cur += curr_offset; /* move to the next line */
  851. p_ref += ref_offset; /* move to the next line */
  852. }
  853. }
  854. else
  855. {
  856. for (j = blkheight; j > 0 ; j--)
  857. {
  858. tmp = (uint32)(p_ref + blkwidth);
  859. for (; (uint32)p_ref < tmp;)
  860. {
  861. r0 = p_ref[-2];
  862. r1 = p_ref[-1];
  863. r2 = *p_ref++;
  864. r3 = *p_ref++;
  865. r4 = *p_ref++;
  866. /* first pixel */
  867. r5 = *p_ref++;
  868. result = (r0 + r5);
  869. r0 = (r1 + r4);
  870. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  871. r0 = (r2 + r3);
  872. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  873. result = (result + 512) >> 10;
  874. CLIP_RESULT(result)
  875. pkres = result;
  876. /* second pixel */
  877. r0 = *p_ref++;
  878. result = (r1 + r0);
  879. r1 = (r2 + r5);
  880. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  881. r1 = (r3 + r4);
  882. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  883. result = (result + 512) >> 10;
  884. CLIP_RESULT(result)
  885. pkres |= (result << 8);
  886. /* third pixel */
  887. r1 = *p_ref++;
  888. result = (r2 + r1);
  889. r2 = (r3 + r0);
  890. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  891. r2 = (r4 + r5);
  892. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  893. result = (result + 512) >> 10;
  894. CLIP_RESULT(result)
  895. pkres |= (result << 16);
  896. /* fourth pixel */
  897. r2 = *p_ref++;
  898. result = (r3 + r2);
  899. r3 = (r4 + r1);
  900. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  901. r3 = (r5 + r0);
  902. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  903. result = (result + 512) >> 10;
  904. CLIP_RESULT(result)
  905. pkres |= (result << 24);
  906. *p_cur++ = pkres; /* write 4 pixels */
  907. p_ref -= 3; /* offset back to the middle of filter */
  908. }
  909. p_cur += curr_offset; /* move to the next line */
  910. p_ref += ref_offset; /* move to the next line */
  911. }
  912. }
  913. return ;
  914. }
  915. void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
  916. int blkwidth, int blkheight)
  917. {
  918. uint8 *p_ref;
  919. int *p_cur;
  920. uint32 tmp;
  921. int result, curr_offset, ref_offset;
  922. int j, r0, r1, r2, r3, r4, r5;
  923. p_cur = out;
  924. curr_offset = (outpitch - blkwidth);
  925. p_ref = in;
  926. ref_offset = inpitch - blkwidth;
  927. for (j = blkheight; j > 0 ; j--)
  928. {
  929. tmp = (uint32)(p_ref + blkwidth);
  930. for (; (uint32)p_ref < tmp;)
  931. {
  932. r0 = p_ref[-2];
  933. r1 = p_ref[-1];
  934. r2 = *p_ref++;
  935. r3 = *p_ref++;
  936. r4 = *p_ref++;
  937. /* first pixel */
  938. r5 = *p_ref++;
  939. result = (r0 + r5);
  940. r0 = (r1 + r4);
  941. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  942. r0 = (r2 + r3);
  943. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  944. *p_cur++ = result;
  945. /* second pixel */
  946. r0 = *p_ref++;
  947. result = (r1 + r0);
  948. r1 = (r2 + r5);
  949. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  950. r1 = (r3 + r4);
  951. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  952. *p_cur++ = result;
  953. /* third pixel */
  954. r1 = *p_ref++;
  955. result = (r2 + r1);
  956. r2 = (r3 + r0);
  957. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  958. r2 = (r4 + r5);
  959. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  960. *p_cur++ = result;
  961. /* fourth pixel */
  962. r2 = *p_ref++;
  963. result = (r3 + r2);
  964. r3 = (r4 + r1);
  965. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  966. r3 = (r5 + r0);
  967. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  968. *p_cur++ = result;
  969. p_ref -= 3; /* move back to the middle of the filter */
  970. }
  971. p_cur += curr_offset; /* move to the next line */
  972. p_ref += ref_offset;
  973. }
  974. return ;
  975. }
  976. void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
  977. int blkwidth, int blkheight, int dy)
  978. {
  979. uint8 *p_cur, *p_ref;
  980. uint32 tmp;
  981. int result, curr_offset, ref_offset;
  982. int j, i;
  983. int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
  984. uint8 tmp_in[24][24];
  985. /* not word-aligned */
  986. if (((uint32)in)&0x3)
  987. {
  988. CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
  989. in = &tmp_in[2][0];
  990. inpitch = 24;
  991. }
  992. p_cur = out;
  993. curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
  994. ref_offset = blkheight * inpitch; /* for limit */
  995. curr_offset += 3;
  996. if (dy&1)
  997. {
  998. dy = (dy >> 1) ? 0 : -inpitch;
  999. for (j = 0; j < blkwidth; j += 4, in += 4)
  1000. {
  1001. r13 = 0;
  1002. p_ref = in;
  1003. p_cur -= outpitch; /* compensate for the first offset */
  1004. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1005. while ((uint32)p_ref < tmp) /* the loop un-rolled */
  1006. {
  1007. r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
  1008. p_ref += inpitch;
  1009. r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
  1010. r0 &= 0xFF00FF;
  1011. r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
  1012. r7 = (r1 >> 8) & 0xFF00FF;
  1013. r1 &= 0xFF00FF;
  1014. r0 += r1;
  1015. r6 += r7;
  1016. r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
  1017. r8 = (r2 >> 8) & 0xFF00FF;
  1018. r2 &= 0xFF00FF;
  1019. r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
  1020. r7 = (r1 >> 8) & 0xFF00FF;
  1021. r1 &= 0xFF00FF;
  1022. r1 += r2;
  1023. r7 += r8;
  1024. r0 += 20 * r1;
  1025. r6 += 20 * r7;
  1026. r0 += 0x100010;
  1027. r6 += 0x100010;
  1028. r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
  1029. r8 = (r2 >> 8) & 0xFF00FF;
  1030. r2 &= 0xFF00FF;
  1031. r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
  1032. r7 = (r1 >> 8) & 0xFF00FF;
  1033. r1 &= 0xFF00FF;
  1034. r1 += r2;
  1035. r7 += r8;
  1036. r0 -= 5 * r1;
  1037. r6 -= 5 * r7;
  1038. r0 >>= 5;
  1039. r6 >>= 5;
  1040. /* clip */
  1041. r13 |= r6;
  1042. r13 |= r0;
  1043. //CLIPPACK(r6,result)
  1044. r1 = *((uint32*)(p_ref + dy));
  1045. r2 = (r1 >> 8) & 0xFF00FF;
  1046. r1 &= 0xFF00FF;
  1047. r0 += r1;
  1048. r6 += r2;
  1049. r0 += 0x10001;
  1050. r6 += 0x10001;
  1051. r0 = (r0 >> 1) & 0xFF00FF;
  1052. r6 = (r6 >> 1) & 0xFF00FF;
  1053. r0 |= (r6 << 8); /* pack it back */
  1054. *((uint32*)(p_cur += outpitch)) = r0;
  1055. }
  1056. p_cur += curr_offset; /* offset to the next pixel */
  1057. if (r13 & 0xFF000700) /* this column need clipping */
  1058. {
  1059. p_cur -= 4;
  1060. for (i = 0; i < 4; i++)
  1061. {
  1062. p_ref = in + i;
  1063. p_cur -= outpitch; /* compensate for the first offset */
  1064. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1065. while ((uint32)p_ref < tmp)
  1066. { /* loop un-rolled */
  1067. r0 = *(p_ref - (inpitch << 1));
  1068. r1 = *(p_ref - inpitch);
  1069. r2 = *p_ref;
  1070. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1071. r4 = *(p_ref += inpitch);
  1072. /* first pixel */
  1073. r5 = *(p_ref += inpitch);
  1074. result = (r0 + r5);
  1075. r0 = (r1 + r4);
  1076. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1077. r0 = (r2 + r3);
  1078. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1079. result = (result + 16) >> 5;
  1080. CLIP_RESULT(result)
  1081. /* 3/4 pel, no need to clip */
  1082. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1083. result = (result >> 1);
  1084. *(p_cur += outpitch) = result;
  1085. /* second pixel */
  1086. r0 = *(p_ref += inpitch);
  1087. result = (r1 + r0);
  1088. r1 = (r2 + r5);
  1089. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1090. r1 = (r3 + r4);
  1091. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1092. result = (result + 16) >> 5;
  1093. CLIP_RESULT(result)
  1094. /* 3/4 pel, no need to clip */
  1095. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1096. result = (result >> 1);
  1097. *(p_cur += outpitch) = result;
  1098. /* third pixel */
  1099. r1 = *(p_ref += inpitch);
  1100. result = (r2 + r1);
  1101. r2 = (r3 + r0);
  1102. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1103. r2 = (r4 + r5);
  1104. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1105. result = (result + 16) >> 5;
  1106. CLIP_RESULT(result)
  1107. /* 3/4 pel, no need to clip */
  1108. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1109. result = (result >> 1);
  1110. *(p_cur += outpitch) = result;
  1111. /* fourth pixel */
  1112. r2 = *(p_ref += inpitch);
  1113. result = (r3 + r2);
  1114. r3 = (r4 + r1);
  1115. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1116. r3 = (r5 + r0);
  1117. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1118. result = (result + 16) >> 5;
  1119. CLIP_RESULT(result)
  1120. /* 3/4 pel, no need to clip */
  1121. result = (result + p_ref[dy-(inpitch<<1)] + 1);
  1122. result = (result >> 1);
  1123. *(p_cur += outpitch) = result;
  1124. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1125. }
  1126. p_cur += (curr_offset - 3);
  1127. }
  1128. }
  1129. }
  1130. }
  1131. else
  1132. {
  1133. for (j = 0; j < blkwidth; j += 4, in += 4)
  1134. {
  1135. r13 = 0;
  1136. p_ref = in;
  1137. p_cur -= outpitch; /* compensate for the first offset */
  1138. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1139. while ((uint32)p_ref < tmp) /* the loop un-rolled */
  1140. {
  1141. r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
  1142. p_ref += inpitch;
  1143. r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
  1144. r0 &= 0xFF00FF;
  1145. r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */
  1146. r7 = (r1 >> 8) & 0xFF00FF;
  1147. r1 &= 0xFF00FF;
  1148. r0 += r1;
  1149. r6 += r7;
  1150. r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
  1151. r8 = (r2 >> 8) & 0xFF00FF;
  1152. r2 &= 0xFF00FF;
  1153. r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
  1154. r7 = (r1 >> 8) & 0xFF00FF;
  1155. r1 &= 0xFF00FF;
  1156. r1 += r2;
  1157. r7 += r8;
  1158. r0 += 20 * r1;
  1159. r6 += 20 * r7;
  1160. r0 += 0x100010;
  1161. r6 += 0x100010;
  1162. r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
  1163. r8 = (r2 >> 8) & 0xFF00FF;
  1164. r2 &= 0xFF00FF;
  1165. r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
  1166. r7 = (r1 >> 8) & 0xFF00FF;
  1167. r1 &= 0xFF00FF;
  1168. r1 += r2;
  1169. r7 += r8;
  1170. r0 -= 5 * r1;
  1171. r6 -= 5 * r7;
  1172. r0 >>= 5;
  1173. r6 >>= 5;
  1174. /* clip */
  1175. r13 |= r6;
  1176. r13 |= r0;
  1177. //CLIPPACK(r6,result)
  1178. r0 &= 0xFF00FF;
  1179. r6 &= 0xFF00FF;
  1180. r0 |= (r6 << 8); /* pack it back */
  1181. *((uint32*)(p_cur += outpitch)) = r0;
  1182. }
  1183. p_cur += curr_offset; /* offset to the next pixel */
  1184. if (r13 & 0xFF000700) /* this column need clipping */
  1185. {
  1186. p_cur -= 4;
  1187. for (i = 0; i < 4; i++)
  1188. {
  1189. p_ref = in + i;
  1190. p_cur -= outpitch; /* compensate for the first offset */
  1191. tmp = (uint32)(p_ref + ref_offset); /* limit */
  1192. while ((uint32)p_ref < tmp)
  1193. { /* loop un-rolled */
  1194. r0 = *(p_ref - (inpitch << 1));
  1195. r1 = *(p_ref - inpitch);
  1196. r2 = *p_ref;
  1197. r3 = *(p_ref += inpitch); /* modify pointer before loading */
  1198. r4 = *(p_ref += inpitch);
  1199. /* first pixel */
  1200. r5 = *(p_ref += inpitch);
  1201. result = (r0 + r5);
  1202. r0 = (r1 + r4);
  1203. result -= (r0 * 5);//result -= r0; result -= (r0<<2);
  1204. r0 = (r2 + r3);
  1205. result += (r0 * 20);//result += (r0<<4); result += (r0<<2);
  1206. result = (result + 16) >> 5;
  1207. CLIP_RESULT(result)
  1208. *(p_cur += outpitch) = result;
  1209. /* second pixel */
  1210. r0 = *(p_ref += inpitch);
  1211. result = (r1 + r0);
  1212. r1 = (r2 + r5);
  1213. result -= (r1 * 5);//result -= r1; result -= (r1<<2);
  1214. r1 = (r3 + r4);
  1215. result += (r1 * 20);//result += (r1<<4); result += (r1<<2);
  1216. result = (result + 16) >> 5;
  1217. CLIP_RESULT(result)
  1218. *(p_cur += outpitch) = result;
  1219. /* third pixel */
  1220. r1 = *(p_ref += inpitch);
  1221. result = (r2 + r1);
  1222. r2 = (r3 + r0);
  1223. result -= (r2 * 5);//result -= r2; result -= (r2<<2);
  1224. r2 = (r4 + r5);
  1225. result += (r2 * 20);//result += (r2<<4); result += (r2<<2);
  1226. result = (result + 16) >> 5;
  1227. CLIP_RESULT(result)
  1228. *(p_cur += outpitch) = result;
  1229. /* fourth pixel */
  1230. r2 = *(p_ref += inpitch);
  1231. result = (r3 + r2);
  1232. r3 = (r4 + r1);
  1233. result -= (r3 * 5);//result -= r3; result -= (r3<<2);
  1234. r3 = (r5 + r0);
  1235. result += (r3 * 20);//result += (r3<<4); result += (r3<<2);
  1236. result = (result + 16) >> 5;
  1237. CLIP_RESULT(result)
  1238. *(p_cur += outpitch) = result;
  1239. p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */
  1240. }
  1241. p_cur += (curr_offset - 3);
  1242. }
  1243. }
  1244. }
  1245. }
  1246. return ;
  1247. }
  1248. void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,

Large files are truncated, but you can click here to view the full file