PageRenderTime 51ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/H264Dec/source/h264bsd_reconstruct.c

http://github.com/mbebenita/Broadway
C | 2315 lines | 1666 code | 259 blank | 390 comment | 110 complexity | a243c534b0997850d53f65736fd8ffbe MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*
  2. * Copyright (C) 2009 The Android Open Source Project
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*------------------------------------------------------------------------------
  17. Table of contents
  18. 1. Include headers
  19. 2. External compiler flags
  20. 3. Module defines
  21. 4. Local function prototypes
  22. 5. Functions
  23. ------------------------------------------------------------------------------*/
  24. /*------------------------------------------------------------------------------
  25. 1. Include headers
  26. ------------------------------------------------------------------------------*/
  27. #include "basetype.h"
  28. #include "h264bsd_reconstruct.h"
  29. #include "h264bsd_macroblock_layer.h"
  30. #include "h264bsd_image.h"
  31. #include "h264bsd_util.h"
  32. #ifdef H264DEC_OMXDL
  33. #include "omxtypes.h"
  34. #include "omxVC.h"
  35. #include "armVC.h"
  36. #endif /* H264DEC_OMXDL */
  37. /*------------------------------------------------------------------------------
  38. 2. External compiler flags
  39. --------------------------------------------------------------------------------
  40. --------------------------------------------------------------------------------
  41. 3. Module defines
  42. ------------------------------------------------------------------------------*/
  43. /* Switch off the following Lint messages for this file:
  44. * Info 701: Shift left of signed quantity (int)
  45. * Info 702: Shift right of signed quantity (int)
  46. */
  47. /*lint -e701 -e702 */
  48. /* Luma fractional-sample positions
  49. *
  50. * G a b c H
  51. * d e f g
  52. * h i j k m
  53. * n p q r
  54. * M s N
  55. *
  56. * G, H, M and N are integer sample positions
  57. * a-s are fractional samples that need to be interpolated.
  58. */
  59. #ifndef H264DEC_OMXDL
  60. static const u32 lumaFracPos[4][4] = {
  61. /* G d h n a e i p b f j q c g k r */
  62. {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
  63. #endif /* H264DEC_OMXDL */
  64. /* clipping table, defined in h264bsd_intra_prediction.c */
  65. extern const u8 h264bsdClip[];
  66. /*------------------------------------------------------------------------------
  67. 4. Local function prototypes
  68. ------------------------------------------------------------------------------*/
  69. #ifndef H264DEC_OMXDL
  70. /*------------------------------------------------------------------------------
  71. Function: h264bsdInterpolateChromaHor
  72. Functional description:
  73. This function performs chroma interpolation in horizontal direction.
  74. Overfilling is done only if needed. Reference image (pRef) is
  75. read at correct position and the predicted part is written to
  76. macroblock's chrominance (predPartChroma)
  77. Inputs:
  78. pRef pointer to reference frame Cb top-left corner
  79. x0 integer x-coordinate for prediction
  80. y0 integer y-coordinate for prediction
  81. width width of the reference frame chrominance in pixels
  82. height height of the reference frame chrominance in pixels
  83. xFrac horizontal fraction for prediction in 1/8 pixels
  84. chromaPartWidth width of the predicted part in pixels
  85. chromaPartHeight height of the predicted part in pixels
  86. Outputs:
  87. predPartChroma pointer where predicted part is written
  88. ------------------------------------------------------------------------------*/
  89. #ifndef H264DEC_ARM11
  90. void h264bsdInterpolateChromaHor(
  91. u8 *pRef,
  92. u8 *predPartChroma,
  93. i32 x0,
  94. i32 y0,
  95. u32 width,
  96. u32 height,
  97. u32 xFrac,
  98. u32 chromaPartWidth,
  99. u32 chromaPartHeight)
  100. {
  101. /* Variables */
  102. u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;
  103. u8 *ptrA, *cbr;
  104. u32 comp;
  105. u8 block[9*8*2];
  106. /* Code */
  107. ASSERT(predPartChroma);
  108. ASSERT(chromaPartWidth);
  109. ASSERT(chromaPartHeight);
  110. ASSERT(xFrac < 8);
  111. ASSERT(pRef);
  112. if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
  113. (y0 < 0) || ((u32)y0+chromaPartHeight > height))
  114. {
  115. h264bsdFillBlock(pRef, block, x0, y0, width, height,
  116. chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);
  117. pRef += width * height;
  118. h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,
  119. x0, y0, width, height, chromaPartWidth + 1,
  120. chromaPartHeight, chromaPartWidth + 1);
  121. pRef = block;
  122. x0 = 0;
  123. y0 = 0;
  124. width = chromaPartWidth+1;
  125. height = chromaPartHeight;
  126. }
  127. val = 8 - xFrac;
  128. for (comp = 0; comp <= 1; comp++)
  129. {
  130. ptrA = pRef + (comp * height + (u32)y0) * width + x0;
  131. cbr = predPartChroma + comp * 8 * 8;
  132. /* 2x2 pels per iteration
  133. * bilinear horizontal interpolation */
  134. for (y = (chromaPartHeight >> 1); y; y--)
  135. {
  136. for (x = (chromaPartWidth >> 1); x; x--)
  137. {
  138. tmp1 = ptrA[width];
  139. tmp2 = *ptrA++;
  140. tmp3 = ptrA[width];
  141. tmp4 = *ptrA++;
  142. c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;
  143. c >>= 6;
  144. cbr[8] = (u8)c;
  145. c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;
  146. c >>= 6;
  147. *cbr++ = (u8)c;
  148. tmp1 = ptrA[width];
  149. tmp2 = *ptrA;
  150. c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;
  151. c >>= 6;
  152. cbr[8] = (u8)c;
  153. c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;
  154. c >>= 6;
  155. *cbr++ = (u8)c;
  156. }
  157. cbr += 2*8 - chromaPartWidth;
  158. ptrA += 2*width - chromaPartWidth;
  159. }
  160. }
  161. }
  162. /*------------------------------------------------------------------------------
  163. Function: h264bsdInterpolateChromaVer
  164. Functional description:
  165. This function performs chroma interpolation in vertical direction.
  166. Overfilling is done only if needed. Reference image (pRef) is
  167. read at correct position and the predicted part is written to
  168. macroblock's chrominance (predPartChroma)
  169. ------------------------------------------------------------------------------*/
  170. void h264bsdInterpolateChromaVer(
  171. u8 *pRef,
  172. u8 *predPartChroma,
  173. i32 x0,
  174. i32 y0,
  175. u32 width,
  176. u32 height,
  177. u32 yFrac,
  178. u32 chromaPartWidth,
  179. u32 chromaPartHeight)
  180. {
  181. /* Variables */
  182. u32 x, y, tmp1, tmp2, tmp3, c, val;
  183. u8 *ptrA, *cbr;
  184. u32 comp;
  185. u8 block[9*8*2];
  186. /* Code */
  187. ASSERT(predPartChroma);
  188. ASSERT(chromaPartWidth);
  189. ASSERT(chromaPartHeight);
  190. ASSERT(yFrac < 8);
  191. ASSERT(pRef);
  192. if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
  193. (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
  194. {
  195. h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
  196. chromaPartHeight + 1, chromaPartWidth);
  197. pRef += width * height;
  198. h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
  199. x0, y0, width, height, chromaPartWidth,
  200. chromaPartHeight + 1, chromaPartWidth);
  201. pRef = block;
  202. x0 = 0;
  203. y0 = 0;
  204. width = chromaPartWidth;
  205. height = chromaPartHeight+1;
  206. }
  207. val = 8 - yFrac;
  208. for (comp = 0; comp <= 1; comp++)
  209. {
  210. ptrA = pRef + (comp * height + (u32)y0) * width + x0;
  211. cbr = predPartChroma + comp * 8 * 8;
  212. /* 2x2 pels per iteration
  213. * bilinear vertical interpolation */
  214. for (y = (chromaPartHeight >> 1); y; y--)
  215. {
  216. for (x = (chromaPartWidth >> 1); x; x--)
  217. {
  218. tmp3 = ptrA[width*2];
  219. tmp2 = ptrA[width];
  220. tmp1 = *ptrA++;
  221. c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
  222. c >>= 6;
  223. cbr[8] = (u8)c;
  224. c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
  225. c >>= 6;
  226. *cbr++ = (u8)c;
  227. tmp3 = ptrA[width*2];
  228. tmp2 = ptrA[width];
  229. tmp1 = *ptrA++;
  230. c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
  231. c >>= 6;
  232. cbr[8] = (u8)c;
  233. c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
  234. c >>= 6;
  235. *cbr++ = (u8)c;
  236. }
  237. cbr += 2*8 - chromaPartWidth;
  238. ptrA += 2*width - chromaPartWidth;
  239. }
  240. }
  241. }
  242. #endif
  243. /*------------------------------------------------------------------------------
  244. Function: h264bsdInterpolateChromaHorVer
  245. Functional description:
  246. This function performs chroma interpolation in horizontal and
  247. vertical direction. Overfilling is done only if needed. Reference
  248. image (ref) is read at correct position and the predicted part
  249. is written to macroblock's chrominance (predPartChroma)
  250. ------------------------------------------------------------------------------*/
  251. void h264bsdInterpolateChromaHorVer(
  252. u8 *ref,
  253. u8 *predPartChroma,
  254. i32 x0,
  255. i32 y0,
  256. u32 width,
  257. u32 height,
  258. u32 xFrac,
  259. u32 yFrac,
  260. u32 chromaPartWidth,
  261. u32 chromaPartHeight)
  262. {
  263. u8 block[9*9*2];
  264. u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
  265. u32 comp;
  266. u8 *ptrA, *cbr;
  267. /* Code */
  268. ASSERT(predPartChroma);
  269. ASSERT(chromaPartWidth);
  270. ASSERT(chromaPartHeight);
  271. ASSERT(xFrac < 8);
  272. ASSERT(yFrac < 8);
  273. ASSERT(ref);
  274. if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
  275. (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
  276. {
  277. h264bsdFillBlock(ref, block, x0, y0, width, height,
  278. chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
  279. ref += width * height;
  280. h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
  281. x0, y0, width, height, chromaPartWidth + 1,
  282. chromaPartHeight + 1, chromaPartWidth + 1);
  283. ref = block;
  284. x0 = 0;
  285. y0 = 0;
  286. width = chromaPartWidth+1;
  287. height = chromaPartHeight+1;
  288. }
  289. valX = 8 - xFrac;
  290. valY = 8 - yFrac;
  291. for (comp = 0; comp <= 1; comp++)
  292. {
  293. ptrA = ref + (comp * height + (u32)y0) * width + x0;
  294. cbr = predPartChroma + comp * 8 * 8;
  295. /* 2x2 pels per iteration
  296. * bilinear vertical and horizontal interpolation */
  297. for (y = (chromaPartHeight >> 1); y; y--)
  298. {
  299. tmp1 = *ptrA;
  300. tmp3 = ptrA[width];
  301. tmp5 = ptrA[width*2];
  302. tmp1 *= valY;
  303. tmp1 += tmp3 * yFrac;
  304. tmp3 *= valY;
  305. tmp3 += tmp5 * yFrac;
  306. for (x = (chromaPartWidth >> 1); x; x--)
  307. {
  308. tmp2 = *++ptrA;
  309. tmp4 = ptrA[width];
  310. tmp6 = ptrA[width*2];
  311. tmp2 *= valY;
  312. tmp2 += tmp4 * yFrac;
  313. tmp4 *= valY;
  314. tmp4 += tmp6 * yFrac;
  315. tmp1 = tmp1 * valX + plus32;
  316. tmp3 = tmp3 * valX + plus32;
  317. tmp1 += tmp2 * xFrac;
  318. tmp1 >>= 6;
  319. tmp3 += tmp4 * xFrac;
  320. tmp3 >>= 6;
  321. cbr[8] = (u8)tmp3;
  322. *cbr++ = (u8)tmp1;
  323. tmp1 = *++ptrA;
  324. tmp3 = ptrA[width];
  325. tmp5 = ptrA[width*2];
  326. tmp1 *= valY;
  327. tmp1 += tmp3 * yFrac;
  328. tmp3 *= valY;
  329. tmp3 += tmp5 * yFrac;
  330. tmp2 = tmp2 * valX + plus32;
  331. tmp4 = tmp4 * valX + plus32;
  332. tmp2 += tmp1 * xFrac;
  333. tmp2 >>= 6;
  334. tmp4 += tmp3 * xFrac;
  335. tmp4 >>= 6;
  336. cbr[8] = (u8)tmp4;
  337. *cbr++ = (u8)tmp2;
  338. }
  339. cbr += 2*8 - chromaPartWidth;
  340. ptrA += 2*width - chromaPartWidth;
  341. }
  342. }
  343. }
  344. /*------------------------------------------------------------------------------
  345. Function: PredictChroma
  346. Functional description:
  347. Top level chroma prediction function that calls the appropriate
  348. interpolation function. The output is written to macroblock array.
  349. ------------------------------------------------------------------------------*/
  350. static void PredictChroma(
  351. u8 *mbPartChroma,
  352. u32 xAL,
  353. u32 yAL,
  354. u32 partWidth,
  355. u32 partHeight,
  356. mv_t *mv,
  357. image_t *refPic)
  358. {
  359. /* Variables */
  360. u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
  361. i32 xInt, yInt;
  362. u8 *ref;
  363. /* Code */
  364. ASSERT(mv);
  365. ASSERT(refPic);
  366. ASSERT(refPic->data);
  367. ASSERT(refPic->width);
  368. ASSERT(refPic->height);
  369. width = 8 * refPic->width;
  370. height = 8 * refPic->height;
  371. xInt = (xAL >> 1) + (mv->hor >> 3);
  372. yInt = (yAL >> 1) + (mv->ver >> 3);
  373. xFrac = mv->hor & 0x7;
  374. yFrac = mv->ver & 0x7;
  375. chromaPartWidth = partWidth >> 1;
  376. chromaPartHeight = partHeight >> 1;
  377. ref = refPic->data + 256 * refPic->width * refPic->height;
  378. if (xFrac && yFrac)
  379. {
  380. h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
  381. height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
  382. }
  383. else if (xFrac)
  384. {
  385. h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
  386. height, xFrac, chromaPartWidth, chromaPartHeight);
  387. }
  388. else if (yFrac)
  389. {
  390. h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
  391. height, yFrac, chromaPartWidth, chromaPartHeight);
  392. }
  393. else
  394. {
  395. h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
  396. chromaPartWidth, chromaPartHeight, 8);
  397. ref += width * height;
  398. h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
  399. chromaPartWidth, chromaPartHeight, 8);
  400. }
  401. }
  402. /*------------------------------------------------------------------------------
  403. Function: h264bsdInterpolateVerHalf
  404. Functional description:
  405. Function to perform vertical interpolation of pixel position 'h'
  406. for a block. Overfilling is done only if needed. Reference
  407. image (ref) is read at correct position and the predicted part
  408. is written to macroblock array (mb)
  409. ------------------------------------------------------------------------------*/
  410. #ifndef H264DEC_ARM11
  411. void h264bsdInterpolateVerHalf(
  412. u8 *ref,
  413. u8 *mb,
  414. i32 x0,
  415. i32 y0,
  416. u32 width,
  417. u32 height,
  418. u32 partWidth,
  419. u32 partHeight)
  420. {
  421. u32 p1[21*21/4+1];
  422. u32 i, j;
  423. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  424. u8 *ptrC, *ptrV;
  425. const u8 *clp = h264bsdClip + 512;
  426. /* Code */
  427. ASSERT(ref);
  428. ASSERT(mb);
  429. if ((x0 < 0) || ((u32)x0+partWidth > width) ||
  430. (y0 < 0) || ((u32)y0+partHeight+5 > height))
  431. {
  432. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  433. partWidth, partHeight+5, partWidth);
  434. x0 = 0;
  435. y0 = 0;
  436. ref = (u8*)p1;
  437. width = partWidth;
  438. }
  439. ref += (u32)y0 * width + (u32)x0;
  440. ptrC = ref + width;
  441. ptrV = ptrC + 5*width;
  442. /* 4 pixels per iteration, interpolate using 5 vertical samples */
  443. for (i = (partHeight >> 2); i; i--)
  444. {
  445. /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
  446. for (j = partWidth; j; j--)
  447. {
  448. tmp4 = ptrV[-(i32)width*2];
  449. tmp5 = ptrV[-(i32)width];
  450. tmp1 = ptrV[width];
  451. tmp2 = ptrV[width*2];
  452. tmp6 = *ptrV++;
  453. tmp7 = tmp4 + tmp1;
  454. tmp2 -= (tmp7 << 2);
  455. tmp2 -= tmp7;
  456. tmp2 += 16;
  457. tmp7 = tmp5 + tmp6;
  458. tmp3 = ptrC[width*2];
  459. tmp2 += (tmp7 << 4);
  460. tmp2 += (tmp7 << 2);
  461. tmp2 += tmp3;
  462. tmp2 = clp[tmp2>>5];
  463. tmp1 += 16;
  464. mb[48] = (u8)tmp2;
  465. tmp7 = tmp3 + tmp6;
  466. tmp1 -= (tmp7 << 2);
  467. tmp1 -= tmp7;
  468. tmp7 = tmp4 + tmp5;
  469. tmp2 = ptrC[width];
  470. tmp1 += (tmp7 << 4);
  471. tmp1 += (tmp7 << 2);
  472. tmp1 += tmp2;
  473. tmp1 = clp[tmp1>>5];
  474. tmp6 += 16;
  475. mb[32] = (u8)tmp1;
  476. tmp7 = tmp2 + tmp5;
  477. tmp6 -= (tmp7 << 2);
  478. tmp6 -= tmp7;
  479. tmp7 = tmp4 + tmp3;
  480. tmp1 = *ptrC;
  481. tmp6 += (tmp7 << 4);
  482. tmp6 += (tmp7 << 2);
  483. tmp6 += tmp1;
  484. tmp6 = clp[tmp6>>5];
  485. tmp5 += 16;
  486. mb[16] = (u8)tmp6;
  487. tmp1 += tmp4;
  488. tmp5 -= (tmp1 << 2);
  489. tmp5 -= tmp1;
  490. tmp3 += tmp2;
  491. tmp6 = ptrC[-(i32)width];
  492. tmp5 += (tmp3 << 4);
  493. tmp5 += (tmp3 << 2);
  494. tmp5 += tmp6;
  495. tmp5 = clp[tmp5>>5];
  496. *mb++ = (u8)tmp5;
  497. ptrC++;
  498. }
  499. ptrC += 4*width - partWidth;
  500. ptrV += 4*width - partWidth;
  501. mb += 4*16 - partWidth;
  502. }
  503. }
  504. /*------------------------------------------------------------------------------
  505. Function: h264bsdInterpolateVerQuarter
  506. Functional description:
  507. Function to perform vertical interpolation of pixel position 'd'
  508. or 'n' for a block. Overfilling is done only if needed. Reference
  509. image (ref) is read at correct position and the predicted part
  510. is written to macroblock array (mb)
  511. ------------------------------------------------------------------------------*/
  512. void h264bsdInterpolateVerQuarter(
  513. u8 *ref,
  514. u8 *mb,
  515. i32 x0,
  516. i32 y0,
  517. u32 width,
  518. u32 height,
  519. u32 partWidth,
  520. u32 partHeight,
  521. u32 verOffset) /* 0 for pixel d, 1 for pixel n */
  522. {
  523. u32 p1[21*21/4+1];
  524. u32 i, j;
  525. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  526. u8 *ptrC, *ptrV, *ptrInt;
  527. const u8 *clp = h264bsdClip + 512;
  528. /* Code */
  529. ASSERT(ref);
  530. ASSERT(mb);
  531. if ((x0 < 0) || ((u32)x0+partWidth > width) ||
  532. (y0 < 0) || ((u32)y0+partHeight+5 > height))
  533. {
  534. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  535. partWidth, partHeight+5, partWidth);
  536. x0 = 0;
  537. y0 = 0;
  538. ref = (u8*)p1;
  539. width = partWidth;
  540. }
  541. ref += (u32)y0 * width + (u32)x0;
  542. ptrC = ref + width;
  543. ptrV = ptrC + 5*width;
  544. /* Pointer to integer sample position, either M or R */
  545. ptrInt = ptrC + (2+verOffset)*width;
  546. /* 4 pixels per iteration
  547. * interpolate using 5 vertical samples and average between
  548. * interpolated value and integer sample value */
  549. for (i = (partHeight >> 2); i; i--)
  550. {
  551. /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
  552. for (j = partWidth; j; j--)
  553. {
  554. tmp4 = ptrV[-(i32)width*2];
  555. tmp5 = ptrV[-(i32)width];
  556. tmp1 = ptrV[width];
  557. tmp2 = ptrV[width*2];
  558. tmp6 = *ptrV++;
  559. tmp7 = tmp4 + tmp1;
  560. tmp2 -= (tmp7 << 2);
  561. tmp2 -= tmp7;
  562. tmp2 += 16;
  563. tmp7 = tmp5 + tmp6;
  564. tmp3 = ptrC[width*2];
  565. tmp2 += (tmp7 << 4);
  566. tmp2 += (tmp7 << 2);
  567. tmp2 += tmp3;
  568. tmp2 = clp[tmp2>>5];
  569. tmp7 = ptrInt[width*2];
  570. tmp1 += 16;
  571. tmp2++;
  572. mb[48] = (u8)((tmp2 + tmp7) >> 1);
  573. tmp7 = tmp3 + tmp6;
  574. tmp1 -= (tmp7 << 2);
  575. tmp1 -= tmp7;
  576. tmp7 = tmp4 + tmp5;
  577. tmp2 = ptrC[width];
  578. tmp1 += (tmp7 << 4);
  579. tmp1 += (tmp7 << 2);
  580. tmp1 += tmp2;
  581. tmp1 = clp[tmp1>>5];
  582. tmp7 = ptrInt[width];
  583. tmp6 += 16;
  584. tmp1++;
  585. mb[32] = (u8)((tmp1 + tmp7) >> 1);
  586. tmp7 = tmp2 + tmp5;
  587. tmp6 -= (tmp7 << 2);
  588. tmp6 -= tmp7;
  589. tmp7 = tmp4 + tmp3;
  590. tmp1 = *ptrC;
  591. tmp6 += (tmp7 << 4);
  592. tmp6 += (tmp7 << 2);
  593. tmp6 += tmp1;
  594. tmp6 = clp[tmp6>>5];
  595. tmp7 = *ptrInt;
  596. tmp5 += 16;
  597. tmp6++;
  598. mb[16] = (u8)((tmp6 + tmp7) >> 1);
  599. tmp1 += tmp4;
  600. tmp5 -= (tmp1 << 2);
  601. tmp5 -= tmp1;
  602. tmp3 += tmp2;
  603. tmp6 = ptrC[-(i32)width];
  604. tmp5 += (tmp3 << 4);
  605. tmp5 += (tmp3 << 2);
  606. tmp5 += tmp6;
  607. tmp5 = clp[tmp5>>5];
  608. tmp7 = ptrInt[-(i32)width];
  609. tmp5++;
  610. *mb++ = (u8)((tmp5 + tmp7) >> 1);
  611. ptrC++;
  612. ptrInt++;
  613. }
  614. ptrC += 4*width - partWidth;
  615. ptrV += 4*width - partWidth;
  616. ptrInt += 4*width - partWidth;
  617. mb += 4*16 - partWidth;
  618. }
  619. }
  620. /*------------------------------------------------------------------------------
  621. Function: h264bsdInterpolateHorHalf
  622. Functional description:
  623. Function to perform horizontal interpolation of pixel position 'b'
  624. for a block. Overfilling is done only if needed. Reference
  625. image (ref) is read at correct position and the predicted part
  626. is written to macroblock array (mb)
  627. ------------------------------------------------------------------------------*/
  628. void h264bsdInterpolateHorHalf(
  629. u8 *ref,
  630. u8 *mb,
  631. i32 x0,
  632. i32 y0,
  633. u32 width,
  634. u32 height,
  635. u32 partWidth,
  636. u32 partHeight)
  637. {
  638. u32 p1[21*21/4+1];
  639. u8 *ptrJ;
  640. u32 x, y;
  641. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  642. const u8 *clp = h264bsdClip + 512;
  643. /* Code */
  644. ASSERT(ref);
  645. ASSERT(mb);
  646. ASSERT((partWidth&0x3) == 0);
  647. ASSERT((partHeight&0x3) == 0);
  648. if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
  649. (y0 < 0) || ((u32)y0+partHeight > height))
  650. {
  651. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  652. partWidth+5, partHeight, partWidth+5);
  653. x0 = 0;
  654. y0 = 0;
  655. ref = (u8*)p1;
  656. width = partWidth + 5;
  657. }
  658. ref += (u32)y0 * width + (u32)x0;
  659. ptrJ = ref + 5;
  660. for (y = partHeight; y; y--)
  661. {
  662. tmp6 = *(ptrJ - 5);
  663. tmp5 = *(ptrJ - 4);
  664. tmp4 = *(ptrJ - 3);
  665. tmp3 = *(ptrJ - 2);
  666. tmp2 = *(ptrJ - 1);
  667. /* calculate 4 pels per iteration */
  668. for (x = (partWidth >> 2); x; x--)
  669. {
  670. /* First pixel */
  671. tmp6 += 16;
  672. tmp7 = tmp3 + tmp4;
  673. tmp6 += (tmp7 << 4);
  674. tmp6 += (tmp7 << 2);
  675. tmp7 = tmp2 + tmp5;
  676. tmp1 = *ptrJ++;
  677. tmp6 -= (tmp7 << 2);
  678. tmp6 -= tmp7;
  679. tmp6 += tmp1;
  680. tmp6 = clp[tmp6>>5];
  681. /* Second pixel */
  682. tmp5 += 16;
  683. tmp7 = tmp2 + tmp3;
  684. *mb++ = (u8)tmp6;
  685. tmp5 += (tmp7 << 4);
  686. tmp5 += (tmp7 << 2);
  687. tmp7 = tmp1 + tmp4;
  688. tmp6 = *ptrJ++;
  689. tmp5 -= (tmp7 << 2);
  690. tmp5 -= tmp7;
  691. tmp5 += tmp6;
  692. tmp5 = clp[tmp5>>5];
  693. /* Third pixel */
  694. tmp4 += 16;
  695. tmp7 = tmp1 + tmp2;
  696. *mb++ = (u8)tmp5;
  697. tmp4 += (tmp7 << 4);
  698. tmp4 += (tmp7 << 2);
  699. tmp7 = tmp6 + tmp3;
  700. tmp5 = *ptrJ++;
  701. tmp4 -= (tmp7 << 2);
  702. tmp4 -= tmp7;
  703. tmp4 += tmp5;
  704. tmp4 = clp[tmp4>>5];
  705. /* Fourth pixel */
  706. tmp3 += 16;
  707. tmp7 = tmp6 + tmp1;
  708. *mb++ = (u8)tmp4;
  709. tmp3 += (tmp7 << 4);
  710. tmp3 += (tmp7 << 2);
  711. tmp7 = tmp5 + tmp2;
  712. tmp4 = *ptrJ++;
  713. tmp3 -= (tmp7 << 2);
  714. tmp3 -= tmp7;
  715. tmp3 += tmp4;
  716. tmp3 = clp[tmp3>>5];
  717. tmp7 = tmp4;
  718. tmp4 = tmp6;
  719. tmp6 = tmp2;
  720. tmp2 = tmp7;
  721. *mb++ = (u8)tmp3;
  722. tmp3 = tmp5;
  723. tmp5 = tmp1;
  724. }
  725. ptrJ += width - partWidth;
  726. mb += 16 - partWidth;
  727. }
  728. }
  729. /*------------------------------------------------------------------------------
  730. Function: h264bsdInterpolateHorQuarter
  731. Functional description:
  732. Function to perform horizontal interpolation of pixel position 'a'
  733. or 'c' for a block. Overfilling is done only if needed. Reference
  734. image (ref) is read at correct position and the predicted part
  735. is written to macroblock array (mb)
  736. ------------------------------------------------------------------------------*/
  737. void h264bsdInterpolateHorQuarter(
  738. u8 *ref,
  739. u8 *mb,
  740. i32 x0,
  741. i32 y0,
  742. u32 width,
  743. u32 height,
  744. u32 partWidth,
  745. u32 partHeight,
  746. u32 horOffset) /* 0 for pixel a, 1 for pixel c */
  747. {
  748. u32 p1[21*21/4+1];
  749. u8 *ptrJ;
  750. u32 x, y;
  751. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  752. const u8 *clp = h264bsdClip + 512;
  753. /* Code */
  754. ASSERT(ref);
  755. ASSERT(mb);
  756. if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
  757. (y0 < 0) || ((u32)y0+partHeight > height))
  758. {
  759. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  760. partWidth+5, partHeight, partWidth+5);
  761. x0 = 0;
  762. y0 = 0;
  763. ref = (u8*)p1;
  764. width = partWidth + 5;
  765. }
  766. ref += (u32)y0 * width + (u32)x0;
  767. ptrJ = ref + 5;
  768. for (y = partHeight; y; y--)
  769. {
  770. tmp6 = *(ptrJ - 5);
  771. tmp5 = *(ptrJ - 4);
  772. tmp4 = *(ptrJ - 3);
  773. tmp3 = *(ptrJ - 2);
  774. tmp2 = *(ptrJ - 1);
  775. /* calculate 4 pels per iteration */
  776. for (x = (partWidth >> 2); x; x--)
  777. {
  778. /* First pixel */
  779. tmp6 += 16;
  780. tmp7 = tmp3 + tmp4;
  781. tmp6 += (tmp7 << 4);
  782. tmp6 += (tmp7 << 2);
  783. tmp7 = tmp2 + tmp5;
  784. tmp1 = *ptrJ++;
  785. tmp6 -= (tmp7 << 2);
  786. tmp6 -= tmp7;
  787. tmp6 += tmp1;
  788. tmp6 = clp[tmp6>>5];
  789. tmp5 += 16;
  790. if (!horOffset)
  791. tmp6 += tmp4;
  792. else
  793. tmp6 += tmp3;
  794. *mb++ = (u8)((tmp6 + 1) >> 1);
  795. /* Second pixel */
  796. tmp7 = tmp2 + tmp3;
  797. tmp5 += (tmp7 << 4);
  798. tmp5 += (tmp7 << 2);
  799. tmp7 = tmp1 + tmp4;
  800. tmp6 = *ptrJ++;
  801. tmp5 -= (tmp7 << 2);
  802. tmp5 -= tmp7;
  803. tmp5 += tmp6;
  804. tmp5 = clp[tmp5>>5];
  805. tmp4 += 16;
  806. if (!horOffset)
  807. tmp5 += tmp3;
  808. else
  809. tmp5 += tmp2;
  810. *mb++ = (u8)((tmp5 + 1) >> 1);
  811. /* Third pixel */
  812. tmp7 = tmp1 + tmp2;
  813. tmp4 += (tmp7 << 4);
  814. tmp4 += (tmp7 << 2);
  815. tmp7 = tmp6 + tmp3;
  816. tmp5 = *ptrJ++;
  817. tmp4 -= (tmp7 << 2);
  818. tmp4 -= tmp7;
  819. tmp4 += tmp5;
  820. tmp4 = clp[tmp4>>5];
  821. tmp3 += 16;
  822. if (!horOffset)
  823. tmp4 += tmp2;
  824. else
  825. tmp4 += tmp1;
  826. *mb++ = (u8)((tmp4 + 1) >> 1);
  827. /* Fourth pixel */
  828. tmp7 = tmp6 + tmp1;
  829. tmp3 += (tmp7 << 4);
  830. tmp3 += (tmp7 << 2);
  831. tmp7 = tmp5 + tmp2;
  832. tmp4 = *ptrJ++;
  833. tmp3 -= (tmp7 << 2);
  834. tmp3 -= tmp7;
  835. tmp3 += tmp4;
  836. tmp3 = clp[tmp3>>5];
  837. if (!horOffset)
  838. tmp3 += tmp1;
  839. else
  840. tmp3 += tmp6;
  841. *mb++ = (u8)((tmp3 + 1) >> 1);
  842. tmp3 = tmp5;
  843. tmp5 = tmp1;
  844. tmp7 = tmp4;
  845. tmp4 = tmp6;
  846. tmp6 = tmp2;
  847. tmp2 = tmp7;
  848. }
  849. ptrJ += width - partWidth;
  850. mb += 16 - partWidth;
  851. }
  852. }
  853. /*------------------------------------------------------------------------------
  854. Function: h264bsdInterpolateHorVerQuarter
  855. Functional description:
  856. Function to perform horizontal and vertical interpolation of pixel
  857. position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
  858. if needed. Reference image (ref) is read at correct position and
  859. the predicted part is written to macroblock array (mb)
  860. ------------------------------------------------------------------------------*/
  861. void h264bsdInterpolateHorVerQuarter(
  862. u8 *ref,
  863. u8 *mb,
  864. i32 x0,
  865. i32 y0,
  866. u32 width,
  867. u32 height,
  868. u32 partWidth,
  869. u32 partHeight,
  870. u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
  871. 2 for pixel p, 3 for pixel r */
  872. {
  873. u32 p1[21*21/4+1];
  874. u8 *ptrC, *ptrJ, *ptrV;
  875. u32 x, y;
  876. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  877. const u8 *clp = h264bsdClip + 512;
  878. /* Code */
  879. ASSERT(ref);
  880. ASSERT(mb);
  881. if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
  882. (y0 < 0) || ((u32)y0+partHeight+5 > height))
  883. {
  884. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  885. partWidth+5, partHeight+5, partWidth+5);
  886. x0 = 0;
  887. y0 = 0;
  888. ref = (u8*)p1;
  889. width = partWidth+5;
  890. }
  891. /* Ref points to G + (-2, -2) */
  892. ref += (u32)y0 * width + (u32)x0;
  893. /* ptrJ points to either J or Q, depending on vertical offset */
  894. ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;
  895. /* ptrC points to either C or D, depending on horizontal offset */
  896. ptrC = ref + width + 2 + (horVerOffset & 0x1);
  897. for (y = partHeight; y; y--)
  898. {
  899. tmp6 = *(ptrJ - 5);
  900. tmp5 = *(ptrJ - 4);
  901. tmp4 = *(ptrJ - 3);
  902. tmp3 = *(ptrJ - 2);
  903. tmp2 = *(ptrJ - 1);
  904. /* Horizontal interpolation, calculate 4 pels per iteration */
  905. for (x = (partWidth >> 2); x; x--)
  906. {
  907. /* First pixel */
  908. tmp6 += 16;
  909. tmp7 = tmp3 + tmp4;
  910. tmp6 += (tmp7 << 4);
  911. tmp6 += (tmp7 << 2);
  912. tmp7 = tmp2 + tmp5;
  913. tmp1 = *ptrJ++;
  914. tmp6 -= (tmp7 << 2);
  915. tmp6 -= tmp7;
  916. tmp6 += tmp1;
  917. tmp6 = clp[tmp6>>5];
  918. /* Second pixel */
  919. tmp5 += 16;
  920. tmp7 = tmp2 + tmp3;
  921. *mb++ = (u8)tmp6;
  922. tmp5 += (tmp7 << 4);
  923. tmp5 += (tmp7 << 2);
  924. tmp7 = tmp1 + tmp4;
  925. tmp6 = *ptrJ++;
  926. tmp5 -= (tmp7 << 2);
  927. tmp5 -= tmp7;
  928. tmp5 += tmp6;
  929. tmp5 = clp[tmp5>>5];
  930. /* Third pixel */
  931. tmp4 += 16;
  932. tmp7 = tmp1 + tmp2;
  933. *mb++ = (u8)tmp5;
  934. tmp4 += (tmp7 << 4);
  935. tmp4 += (tmp7 << 2);
  936. tmp7 = tmp6 + tmp3;
  937. tmp5 = *ptrJ++;
  938. tmp4 -= (tmp7 << 2);
  939. tmp4 -= tmp7;
  940. tmp4 += tmp5;
  941. tmp4 = clp[tmp4>>5];
  942. /* Fourth pixel */
  943. tmp3 += 16;
  944. tmp7 = tmp6 + tmp1;
  945. *mb++ = (u8)tmp4;
  946. tmp3 += (tmp7 << 4);
  947. tmp3 += (tmp7 << 2);
  948. tmp7 = tmp5 + tmp2;
  949. tmp4 = *ptrJ++;
  950. tmp3 -= (tmp7 << 2);
  951. tmp3 -= tmp7;
  952. tmp3 += tmp4;
  953. tmp3 = clp[tmp3>>5];
  954. tmp7 = tmp4;
  955. tmp4 = tmp6;
  956. tmp6 = tmp2;
  957. tmp2 = tmp7;
  958. *mb++ = (u8)tmp3;
  959. tmp3 = tmp5;
  960. tmp5 = tmp1;
  961. }
  962. ptrJ += width - partWidth;
  963. mb += 16 - partWidth;
  964. }
  965. mb -= 16*partHeight;
  966. ptrV = ptrC + 5*width;
  967. for (y = (partHeight >> 2); y; y--)
  968. {
  969. /* Vertical interpolation and averaging, 4 pels per iteration */
  970. for (x = partWidth; x; x--)
  971. {
  972. tmp4 = ptrV[-(i32)width*2];
  973. tmp5 = ptrV[-(i32)width];
  974. tmp1 = ptrV[width];
  975. tmp2 = ptrV[width*2];
  976. tmp6 = *ptrV++;
  977. tmp7 = tmp4 + tmp1;
  978. tmp2 -= (tmp7 << 2);
  979. tmp2 -= tmp7;
  980. tmp2 += 16;
  981. tmp7 = tmp5 + tmp6;
  982. tmp3 = ptrC[width*2];
  983. tmp2 += (tmp7 << 4);
  984. tmp2 += (tmp7 << 2);
  985. tmp2 += tmp3;
  986. tmp7 = clp[tmp2>>5];
  987. tmp2 = mb[48];
  988. tmp1 += 16;
  989. tmp7++;
  990. mb[48] = (u8)((tmp2 + tmp7) >> 1);
  991. tmp7 = tmp3 + tmp6;
  992. tmp1 -= (tmp7 << 2);
  993. tmp1 -= tmp7;
  994. tmp7 = tmp4 + tmp5;
  995. tmp2 = ptrC[width];
  996. tmp1 += (tmp7 << 4);
  997. tmp1 += (tmp7 << 2);
  998. tmp1 += tmp2;
  999. tmp7 = clp[tmp1>>5];
  1000. tmp1 = mb[32];
  1001. tmp6 += 16;
  1002. tmp7++;
  1003. mb[32] = (u8)((tmp1 + tmp7) >> 1);
  1004. tmp1 = *ptrC;
  1005. tmp7 = tmp2 + tmp5;
  1006. tmp6 -= (tmp7 << 2);
  1007. tmp6 -= tmp7;
  1008. tmp7 = tmp4 + tmp3;
  1009. tmp6 += (tmp7 << 4);
  1010. tmp6 += (tmp7 << 2);
  1011. tmp6 += tmp1;
  1012. tmp7 = clp[tmp6>>5];
  1013. tmp6 = mb[16];
  1014. tmp5 += 16;
  1015. tmp7++;
  1016. mb[16] = (u8)((tmp6 + tmp7) >> 1);
  1017. tmp6 = ptrC[-(i32)width];
  1018. tmp1 += tmp4;
  1019. tmp5 -= (tmp1 << 2);
  1020. tmp5 -= tmp1;
  1021. tmp3 += tmp2;
  1022. tmp5 += (tmp3 << 4);
  1023. tmp5 += (tmp3 << 2);
  1024. tmp5 += tmp6;
  1025. tmp7 = clp[tmp5>>5];
  1026. tmp5 = *mb;
  1027. tmp7++;
  1028. *mb++ = (u8)((tmp5 + tmp7) >> 1);
  1029. ptrC++;
  1030. }
  1031. ptrC += 4*width - partWidth;
  1032. ptrV += 4*width - partWidth;
  1033. mb += 4*16 - partWidth;
  1034. }
  1035. }
  1036. #endif
  1037. /*------------------------------------------------------------------------------
  1038. Function: h264bsdInterpolateMidHalf
  1039. Functional description:
  1040. Interpolates the centre half-sample position 'j' for one block. A horizontal 6-tap filter (1,-5,20,20,-5,1) is applied first and its unrounded sums are stored in a local table; the same taps are then applied vertically to those sums and the result is rounded, scaled and clipped.
  1041. Overfilling is done only if needed: when the (partWidth+5) x (partHeight+5) source area starting at (x0,y0) is not entirely inside the reference picture, it is first copied into a local buffer by h264bsdFillBlock.
  1042. Reference image (ref) is read at the correct position and the predicted part is written to the macroblock array (mb), whose row stride is 16 bytes.
  1043. Both passes work in groups of four (partWidth>>2 columns, partHeight>>2 rows), so the partition dimensions are presumably always multiples of 4 -- confirm against callers.
  1044. ------------------------------------------------------------------------------*/
  1045. void h264bsdInterpolateMidHalf(
  1046. u8 *ref, /* reference picture samples, row stride 'width' */
  1047. u8 *mb, /* output: predicted samples, row stride 16 */
  1048. i32 x0, /* x of the top-left source sample; may lie outside the picture */
  1049. i32 y0, /* y of the top-left source sample; may lie outside the picture */
  1050. u32 width, /* reference picture width in samples */
  1051. u32 height, /* reference picture height in samples */
  1052. u32 partWidth, /* width of the block to predict */
  1053. u32 partHeight) /* height of the block to predict */
  1054. {
  1055. u32 p1[21*21/4+1]; /* scratch for an overfilled source area, used as a byte buffer of at least 21*21 bytes */
  1056. u32 x, y;
  1057. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1058. i32 *ptrC, *ptrV, *b1;
  1059. u8 *ptrJ;
  1060. i32 table[21*16]; /* horizontal filter sums: (partHeight+5) rows of partWidth entries */
  1061. const u8 *clp = h264bsdClip + 512; /* clipping lookup biased by 512 so negative indices are usable; table is defined elsewhere -- presumably clamps to 0..255 */
  1062. /* Code */
  1063. ASSERT(ref);
  1064. ASSERT(mb);
  1065. if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
  1066. (y0 < 0) || ((u32)y0+partHeight+5 > height))
  1067. {
  1068. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  1069. partWidth+5, partHeight+5, partWidth+5);
  1070. x0 = 0; /* from here on the local copy is the reference: origin (0,0), stride partWidth+5 */
  1071. y0 = 0;
  1072. ref = (u8*)p1;
  1073. width = partWidth+5;
  1074. }
  1075. ref += (u32)y0 * width + (u32)x0;
  1076. b1 = table;
  1077. ptrJ = ref + 5; /* the five samples before ptrJ prime the filter window */
  1078. /* First step: calculate intermediate values for
  1079. * horizontal interpolation */
  1080. for (y = partHeight + 5; y; y--) /* 5 extra rows are needed by the vertical pass */
  1081. {
  1082. tmp6 = *(ptrJ - 5);
  1083. tmp5 = *(ptrJ - 4);
  1084. tmp4 = *(ptrJ - 3);
  1085. tmp3 = *(ptrJ - 2);
  1086. tmp2 = *(ptrJ - 1);
  1087. /* 4 pels per iteration */
  1088. for (x = (partWidth >> 2); x; x--)
  1089. {
  1090. /* First pixel */
  1091. tmp7 = tmp3 + tmp4; /* the two centre taps */
  1092. tmp6 += (tmp7 << 4); /* (tmp7<<4) + (tmp7<<2) adds 20*tmp7 */
  1093. tmp6 += (tmp7 << 2);
  1094. tmp7 = tmp2 + tmp5; /* the two taps next to the centre */
  1095. tmp1 = *ptrJ++;
  1096. tmp6 -= (tmp7 << 2); /* (tmp7<<2) + tmp7 subtracts 5*tmp7 */
  1097. tmp6 -= tmp7;
  1098. tmp6 += tmp1; /* outermost tap on the right; the left one was the initial tmp6 */
  1099. *b1++ = tmp6; /* stored unrounded and unscaled */
  1100. /* Second pixel */
  1101. tmp7 = tmp2 + tmp3;
  1102. tmp5 += (tmp7 << 4);
  1103. tmp5 += (tmp7 << 2);
  1104. tmp7 = tmp1 + tmp4;
  1105. tmp6 = *ptrJ++;
  1106. tmp5 -= (tmp7 << 2);
  1107. tmp5 -= tmp7;
  1108. tmp5 += tmp6;
  1109. *b1++ = tmp5;
  1110. /* Third pixel */
  1111. tmp7 = tmp1 + tmp2;
  1112. tmp4 += (tmp7 << 4);
  1113. tmp4 += (tmp7 << 2);
  1114. tmp7 = tmp6 + tmp3;
  1115. tmp5 = *ptrJ++;
  1116. tmp4 -= (tmp7 << 2);
  1117. tmp4 -= tmp7;
  1118. tmp4 += tmp5;
  1119. *b1++ = tmp4;
  1120. /* Fourth pixel */
  1121. tmp7 = tmp6 + tmp1;
  1122. tmp3 += (tmp7 << 4);
  1123. tmp3 += (tmp7 << 2);
  1124. tmp7 = tmp5 + tmp2;
  1125. tmp4 = *ptrJ++;
  1126. tmp3 -= (tmp7 << 2);
  1127. tmp3 -= tmp7;
  1128. tmp3 += tmp4;
  1129. *b1++ = tmp3;
  1130. tmp7 = tmp4; /* rotate the registers so tmp6..tmp2 again hold the five most recent samples, oldest first */
  1131. tmp4 = tmp6;
  1132. tmp6 = tmp2;
  1133. tmp2 = tmp7;
  1134. tmp3 = tmp5;
  1135. tmp5 = tmp1;
  1136. }
  1137. ptrJ += width - partWidth; /* advance to the same column of the next source row */
  1138. }
  1139. /* Second step: calculate vertical interpolation */
  1140. ptrC = table + partWidth; /* second table row of the current group */
  1141. ptrV = ptrC + 5*partWidth; /* seventh table row of the current group */
  1142. for (y = (partHeight >> 2); y; y--)
  1143. {
  1144. /* 4 pels per iteration */
  1145. for (x = partWidth; x; x--) /* one column per pass; each pass produces four vertically adjacent outputs */
  1146. {
  1147. tmp4 = ptrV[-(i32)partWidth*2];
  1148. tmp5 = ptrV[-(i32)partWidth];
  1149. tmp1 = ptrV[partWidth];
  1150. tmp2 = ptrV[partWidth*2];
  1151. tmp6 = *ptrV++;
  1152. tmp7 = tmp4 + tmp1;
  1153. tmp2 -= (tmp7 << 2);
  1154. tmp2 -= tmp7;
  1155. tmp2 += 512; /* rounding term for the >>10 below (two filter passes, each with gain 32) */
  1156. tmp7 = tmp5 + tmp6;
  1157. tmp3 = ptrC[partWidth*2];
  1158. tmp2 += (tmp7 << 4);
  1159. tmp2 += (tmp7 << 2);
  1160. tmp2 += tmp3;
  1161. tmp7 = clp[tmp2>>10];
  1162. tmp1 += 512;
  1163. mb[48] = (u8)tmp7; /* fourth output row of the group (3 * 16 bytes further down) */
  1164. tmp7 = tmp3 + tmp6;
  1165. tmp1 -= (tmp7 << 2);
  1166. tmp1 -= tmp7;
  1167. tmp7 = tmp4 + tmp5;
  1168. tmp2 = ptrC[partWidth];
  1169. tmp1 += (tmp7 << 4);
  1170. tmp1 += (tmp7 << 2);
  1171. tmp1 += tmp2;
  1172. tmp7 = clp[tmp1>>10];
  1173. tmp6 += 512;
  1174. mb[32] = (u8)tmp7; /* third output row of the group */
  1175. tmp1 = *ptrC;
  1176. tmp7 = tmp2 + tmp5;
  1177. tmp6 -= (tmp7 << 2);
  1178. tmp6 -= tmp7;
  1179. tmp7 = tmp4 + tmp3;
  1180. tmp6 += (tmp7 << 4);
  1181. tmp6 += (tmp7 << 2);
  1182. tmp6 += tmp1;
  1183. tmp7 = clp[tmp6>>10];
  1184. tmp5 += 512;
  1185. mb[16] = (u8)tmp7; /* second output row of the group */
  1186. tmp6 = ptrC[-(i32)partWidth];
  1187. tmp1 += tmp4;
  1188. tmp5 -= (tmp1 << 2);
  1189. tmp5 -= tmp1;
  1190. tmp3 += tmp2;
  1191. tmp5 += (tmp3 << 4);
  1192. tmp5 += (tmp3 << 2);
  1193. tmp5 += tmp6;
  1194. tmp7 = clp[tmp5>>10];
  1195. *mb++ = (u8)tmp7; /* first output row of the group; then move to the next column */
  1196. ptrC++;
  1197. }
  1198. mb += 4*16 - partWidth; /* down four output rows, back to the first column */
  1199. ptrC += 3*partWidth; /* one table row was consumed by the ++ above, skip three more */
  1200. ptrV += 3*partWidth;
  1201. }
  1202. }
  1203. /*------------------------------------------------------------------------------
  1204. Function: h264bsdInterpolateMidVerQuarter
  1205. Functional description:
  1206. Interpolates quarter-sample position 'f' or 'q' for one block. A horizontal 6-tap filter (1,-5,20,20,-5,1) is applied first and its unrounded sums are stored in a local table; the centre half-sample is obtained by filtering those sums vertically, and the output is the rounded average of that value and the horizontal half-sample of the row selected by verOffset.
  1207. Overfilling is done only if needed: when the (partWidth+5) x (partHeight+5) source area starting at (x0,y0) is not entirely inside the reference picture, it is first copied into a local buffer by h264bsdFillBlock.
  1208. Reference image (ref) is read at the correct position and the predicted part is written to the macroblock array (mb), whose row stride is 16 bytes.
  1209. Both passes work in groups of four, so the partition dimensions are presumably always multiples of 4 -- confirm against callers.
  1210. ------------------------------------------------------------------------------*/
  1211. void h264bsdInterpolateMidVerQuarter(
  1212. u8 *ref, /* reference picture samples, row stride 'width' */
  1213. u8 *mb, /* output: predicted samples, row stride 16 */
  1214. i32 x0, /* x of the top-left source sample; may lie outside the picture */
  1215. i32 y0, /* y of the top-left source sample; may lie outside the picture */
  1216. u32 width, /* reference picture width in samples */
  1217. u32 height, /* reference picture height in samples */
  1218. u32 partWidth, /* width of the block to predict */
  1219. u32 partHeight, /* height of the block to predict */
  1220. u32 verOffset) /* 0 for pixel f, 1 for pixel q */
  1221. {
  1222. u32 p1[21*21/4+1]; /* scratch for an overfilled source area, used as a byte buffer of at least 21*21 bytes */
  1223. u32 x, y;
  1224. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1225. i32 *ptrC, *ptrV, *ptrInt, *b1;
  1226. u8 *ptrJ;
  1227. i32 table[21*16]; /* horizontal filter sums: (partHeight+5) rows of partWidth entries */
  1228. const u8 *clp = h264bsdClip + 512; /* clipping lookup biased by 512 so negative indices are usable; table is defined elsewhere -- presumably clamps to 0..255 */
  1229. /* Code */
  1230. ASSERT(ref);
  1231. ASSERT(mb);
  1232. if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
  1233. (y0 < 0) || ((u32)y0+partHeight+5 > height))
  1234. {
  1235. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  1236. partWidth+5, partHeight+5, partWidth+5);
  1237. x0 = 0; /* from here on the local copy is the reference: origin (0,0), stride partWidth+5 */
  1238. y0 = 0;
  1239. ref = (u8*)p1;
  1240. width = partWidth+5;
  1241. }
  1242. ref += (u32)y0 * width + (u32)x0;
  1243. b1 = table;
  1244. ptrJ = ref + 5; /* the five samples before ptrJ prime the filter window */
  1245. /* First step: calculate intermediate values for
  1246. * horizontal interpolation */
  1247. for (y = partHeight + 5; y; y--) /* 5 extra rows are needed by the vertical pass */
  1248. {
  1249. tmp6 = *(ptrJ - 5);
  1250. tmp5 = *(ptrJ - 4);
  1251. tmp4 = *(ptrJ - 3);
  1252. tmp3 = *(ptrJ - 2);
  1253. tmp2 = *(ptrJ - 1);
  1254. for (x = (partWidth >> 2); x; x--) /* 4 pels per iteration */
  1255. {
  1256. /* First pixel */
  1257. tmp7 = tmp3 + tmp4; /* the two centre taps */
  1258. tmp6 += (tmp7 << 4); /* (tmp7<<4) + (tmp7<<2) adds 20*tmp7 */
  1259. tmp6 += (tmp7 << 2);
  1260. tmp7 = tmp2 + tmp5; /* the two taps next to the centre */
  1261. tmp1 = *ptrJ++;
  1262. tmp6 -= (tmp7 << 2); /* (tmp7<<2) + tmp7 subtracts 5*tmp7 */
  1263. tmp6 -= tmp7;
  1264. tmp6 += tmp1;
  1265. *b1++ = tmp6; /* stored unrounded and unscaled */
  1266. /* Second pixel */
  1267. tmp7 = tmp2 + tmp3;
  1268. tmp5 += (tmp7 << 4);
  1269. tmp5 += (tmp7 << 2);
  1270. tmp7 = tmp1 + tmp4;
  1271. tmp6 = *ptrJ++;
  1272. tmp5 -= (tmp7 << 2);
  1273. tmp5 -= tmp7;
  1274. tmp5 += tmp6;
  1275. *b1++ = tmp5;
  1276. /* Third pixel */
  1277. tmp7 = tmp1 + tmp2;
  1278. tmp4 += (tmp7 << 4);
  1279. tmp4 += (tmp7 << 2);
  1280. tmp7 = tmp6 + tmp3;
  1281. tmp5 = *ptrJ++;
  1282. tmp4 -= (tmp7 << 2);
  1283. tmp4 -= tmp7;
  1284. tmp4 += tmp5;
  1285. *b1++ = tmp4;
  1286. /* Fourth pixel */
  1287. tmp7 = tmp6 + tmp1;
  1288. tmp3 += (tmp7 << 4);
  1289. tmp3 += (tmp7 << 2);
  1290. tmp7 = tmp5 + tmp2;
  1291. tmp4 = *ptrJ++;
  1292. tmp3 -= (tmp7 << 2);
  1293. tmp3 -= tmp7;
  1294. tmp3 += tmp4;
  1295. *b1++ = tmp3;
  1296. tmp7 = tmp4; /* rotate the registers so tmp6..tmp2 again hold the five most recent samples, oldest first */
  1297. tmp4 = tmp6;
  1298. tmp6 = tmp2;
  1299. tmp2 = tmp7;
  1300. tmp3 = tmp5;
  1301. tmp5 = tmp1;
  1302. }
  1303. ptrJ += width - partWidth; /* advance to the same column of the next source row */
  1304. }
  1305. /* Second step: calculate vertical interpolation and average */
  1306. ptrC = table + partWidth; /* second table row of the current group */
  1307. ptrV = ptrC + 5*partWidth; /* seventh table row of the current group */
  1308. /* Pointer into the table of horizontal filter sums (not integer samples): selects the row just above (verOffset 0) or just below (verOffset 1) the centre half-sample; the sum is rounded with (+16)>>5 before averaging */
  1309. ptrInt = ptrC + (2+verOffset)*partWidth;
  1310. for (y = (partHeight >> 2); y; y--)
  1311. {
  1312. for (x = partWidth; x; x--) /* one column per pass; each pass produces four vertically adjacent outputs */
  1313. {
  1314. tmp4 = ptrV[-(i32)partWidth*2];
  1315. tmp5 = ptrV[-(i32)partWidth];
  1316. tmp1 = ptrV[partWidth];
  1317. tmp2 = ptrV[partWidth*2];
  1318. tmp6 = *ptrV++;
  1319. tmp7 = tmp4 + tmp1;
  1320. tmp2 -= (tmp7 << 2);
  1321. tmp2 -= tmp7;
  1322. tmp2 += 512; /* rounding term for the >>10 below (two filter passes, each with gain 32) */
  1323. tmp7 = tmp5 + tmp6;
  1324. tmp3 = ptrC[partWidth*2];
  1325. tmp2 += (tmp7 << 4);
  1326. tmp2 += (tmp7 << 2);
  1327. tmp7 = ptrInt[partWidth*2];
  1328. tmp2 += tmp3;
  1329. tmp2 = clp[tmp2>>10]; /* centre half-sample value */
  1330. tmp7 += 16; /* round and scale the single-pass horizontal sum (gain 32) */
  1331. tmp7 = clp[tmp7>>5];
  1332. tmp1 += 512;
  1333. tmp2++; /* +1 so the >>1 below rounds the average upwards */
  1334. mb[48] = (u8)((tmp7 + tmp2) >> 1); /* fourth output row of the group */
  1335. tmp7 = tmp3 + tmp6;
  1336. tmp1 -= (tmp7 << 2);
  1337. tmp1 -= tmp7;
  1338. tmp7 = tmp4 + tmp5;
  1339. tmp2 = ptrC[partWidth];
  1340. tmp1 += (tmp7 << 4);
  1341. tmp1 += (tmp7 << 2);
  1342. tmp7 = ptrInt[partWidth];
  1343. tmp1 += tmp2;
  1344. tmp1 = clp[tmp1>>10];
  1345. tmp7 += 16;
  1346. tmp7 = clp[tmp7>>5];
  1347. tmp6 += 512;
  1348. tmp1++;
  1349. mb[32] = (u8)((tmp7 + tmp1) >> 1); /* third output row of the group */
  1350. tmp1 = *ptrC;
  1351. tmp7 = tmp2 + tmp5;
  1352. tmp6 -= (tmp7 << 2);
  1353. tmp6 -= tmp7;
  1354. tmp7 = tmp4 + tmp3;
  1355. tmp6 += (tmp7 << 4);
  1356. tmp6 += (tmp7 << 2);
  1357. tmp7 = *ptrInt;
  1358. tmp6 += tmp1;
  1359. tmp6 = clp[tmp6>>10];
  1360. tmp7 += 16;
  1361. tmp7 = clp[tmp7>>5];
  1362. tmp5 += 512;
  1363. tmp6++;
  1364. mb[16] = (u8)((tmp7 + tmp6) >> 1); /* second output row of the group */
  1365. tmp6 = ptrC[-(i32)partWidth];
  1366. tmp1 += tmp4;
  1367. tmp5 -= (tmp1 << 2);
  1368. tmp5 -= tmp1;
  1369. tmp3 += tmp2;
  1370. tmp5 += (tmp3 << 4);
  1371. tmp5 += (tmp3 << 2);
  1372. tmp7 = ptrInt[-(i32)partWidth];
  1373. tmp5 += tmp6;
  1374. tmp5 = clp[tmp5>>10];
  1375. tmp7 += 16;
  1376. tmp7 = clp[tmp7>>5];
  1377. tmp5++;
  1378. *mb++ = (u8)((tmp7 + tmp5) >> 1); /* first output row of the group; then move to the next column */
  1379. ptrC++;
  1380. ptrInt++;
  1381. }
  1382. mb += 4*16 - partWidth; /* down four output rows, back to the first column */
  1383. ptrC += 3*partWidth; /* one table row was consumed by the ++ above, skip three more */
  1384. ptrV += 3*partWidth;
  1385. ptrInt += 3*partWidth;
  1386. }
  1387. }
  1388. /*------------------------------------------------------------------------------
  1389. Function: h264bsdInterpolateMidHorQuarter
  1390. Functional description:
  1391. Interpolates quarter-sample position 'i' or 'k' for one block. A vertical 6-tap filter (1,-5,20,20,-5,1) is applied first over partWidth+5 columns and its unrounded sums are stored in a local table; the centre half-sample is obtained by filtering those sums horizontally, and the output is the rounded average of that value and the vertical half-sample of the column selected by horOffset.
  1392. Overfilling is done only if needed: when the (partWidth+5) x (partHeight+5) source area starting at (x0,y0) is not entirely inside the reference picture, it is first copied into a local buffer by h264bsdFillBlock.
  1393. Reference image (ref) is read at the correct position and the predicted part is written to the macroblock array (mb), whose row stride is 16 bytes.
  1394. Both passes work in groups of four, so the partition dimensions are presumably always multiples of 4 -- confirm against callers.
  1395. ------------------------------------------------------------------------------*/
  1396. void h264bsdInterpolateMidHorQuarter(
  1397. u8 *ref, /* reference picture samples, row stride 'width' */
  1398. u8 *mb, /* output: predicted samples, row stride 16 */
  1399. i32 x0, /* x of the top-left source sample; may lie outside the picture */
  1400. i32 y0, /* y of the top-left source sample; may lie outside the picture */
  1401. u32 width, /* reference picture width in samples */
  1402. u32 height, /* reference picture height in samples */
  1403. u32 partWidth, /* width of the block to predict */
  1404. u32 partHeight, /* height of the block to predict */
  1405. u32 horOffset) /* 0 for pixel i, 1 for pixel k */
  1406. {
  1407. u32 p1[21*21/4+1]; /* scratch for an overfilled source area, used as a byte buffer of at least 21*21 bytes */
  1408. u32 x, y;
  1409. i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1410. i32 *ptrJ, *ptrInt, *h1;
  1411. u8 *ptrC, *ptrV;
  1412. i32 table[21*16]; /* vertical filter sums: partHeight rows of tableWidth entries */
  1413. i32 tableWidth = (i32)partWidth+5; /* 5 extra columns are needed by the horizontal pass */
  1414. const u8 *clp = h264bsdClip + 512; /* clipping lookup biased by 512 so negative indices are usable; table is defined elsewhere -- presumably clamps to 0..255 */
  1415. /* Code */
  1416. ASSERT(ref);
  1417. ASSERT(mb);
  1418. if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
  1419. (y0 < 0) || ((u32)y0+partHeight+5 > height))
  1420. {
  1421. h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
  1422. partWidth+5, partHeight+5, partWidth+5);
  1423. x0 = 0; /* from here on the local copy is the reference: origin (0,0), stride partWidth+5 */
  1424. y0 = 0;
  1425. ref = (u8*)p1;
  1426. width = partWidth+5;
  1427. }
  1428. ref += (u32)y0 * width + (u32)x0;
  1429. h1 = table + tableWidth; /* second table row; the loop writes rows h1-1 .. h1+2 */
  1430. ptrC = ref + width; /* second source row of the current group */
  1431. ptrV = ptrC + 5*width; /* seventh source row of the current group */
  1432. /* First step: calculate intermediate values for
  1433. * vertical interpolation */
  1434. for (y = (partHeight >> 2); y; y--)
  1435. {
  1436. for (x = (u32)tableWidth; x; x--) /* one column per pass; each pass produces four vertically adjacent sums */
  1437. {
  1438. tmp4 = ptrV[-(i32)width*2];
  1439. tmp5 = ptrV[-(i32)width];
  1440. tmp1 = ptrV[width];
  1441. tmp2 = ptrV[width*2];
  1442. tmp6 = *ptrV++;
  1443. tmp7 = tmp4 + tmp1;
  1444. tmp2 -= (tmp7 << 2); /* (tmp7<<2) + tmp7 subtracts 5*tmp7 */
  1445. tmp2 -= tmp7;
  1446. tmp7 = tmp5 + tmp6;
  1447. tmp3 = ptrC[width*2];
  1448. tmp2 += (tmp7 << 4); /* (tmp7<<4) + (tmp7<<2) adds 20*tmp7 */
  1449. tmp2 += (tmp7 << 2);
  1450. tmp2 += tmp3;
  1451. h1[tableWidth*2] = tmp2; /* fourth table row of the group, stored unrounded and unscaled */
  1452. tmp7 = tmp3 + tmp6;
  1453. tmp1 -= (tmp7 << 2);
  1454. tmp1 -= tmp7;
  1455. tmp7 = tmp4 + tmp5;
  1456. tmp2 = ptrC[width];
  1457. tmp1 += (tmp7 << 4);
  1458. tmp1 += (tmp7 << 2);
  1459. tmp1 += tmp2;
  1460. h1[tableWidth] = tmp1; /* third table row of the group */
  1461. tmp1 = *ptrC;
  1462. tmp7 = tmp2 + tmp5;
  1463. tmp6 -= (tmp7 << 2);
  1464. tmp6 -= tmp7;
  1465. tmp7 = tmp4 + tmp3;
  1466. tmp6 += (tmp7 << 4);
  1467. tmp6 += (tmp7 << 2);
  1468. tmp6 += tmp1;
  1469. *h1 = tmp6; /* second table row of the group */
  1470. tmp6 = ptrC[-(i32)width];
  1471. tmp1 += tmp4;
  1472. tmp5 -= (tmp1 << 2);
  1473. tmp5 -= tmp1;
  1474. tmp3 += tmp2;
  1475. tmp5 += (tmp3 << 4);
  1476. tmp5 += (tmp3 << 2);
  1477. tmp5 += tmp6;
  1478. h1[-tableWidth] = tmp5; /* first table row of the group */
  1479. h1++;
  1480. ptrC++;
  1481. }
  1482. ptrC += 4*width - partWidth - 5; /* down four source rows, back to the first column */
  1483. ptrV += 4*width - partWidth - 5;
  1484. h1 += 3*tableWidth; /* one table row was consumed by the ++ above, skip three more */
  1485. }
  1486. /* Second step: calculate horizontal interpolation and average */
  1487. ptrJ = table + 5; /* the five sums before ptrJ prime the filter window */
  1488. /* Pointer into the table of vertical filter sums (not integer samples): selects the column just left (horOffset 0) or just right (horOffset 1) of the centre half-sample; the sum is rounded with (+16)>>5 before averaging */
  1489. ptrInt = table + 2 + horOffset;
  1490. for (y = partHeight; y; y--)
  1491. {
  1492. tmp6 = *(ptrJ - 5);
  1493. tmp5 = *(ptrJ - 4);
  1494. tmp4 = *(ptrJ - 3);
  1495. tmp3 = *(ptrJ - 2);
  1496. tmp2 = *(ptrJ - 1);
  1497. for (x = (partWidth>>2); x; x--) /* 4 pels per iteration */
  1498. {
  1499. /* First pixel */
  1500. tmp6 += 512; /* rounding term for the >>10 below (two filter passes, each with gain 32) */
  1501. tmp7 = tmp3 + tmp4;
  1502. tmp6 += (tmp7 << 4);
  1503. tmp6 += (tmp7 << 2);
  1504. tmp7 = tmp2 + tmp5;
  1505. tmp1 = *ptrJ++;
  1506. tmp6 -= (tmp7 << 2);
  1507. tmp6 -= tmp7;
  1508. tmp7 = *ptrInt++;
  1509. tmp6 += tmp1;
  1510. tmp6 = clp[tmp6 >> 10]; /* centre half-sample value */
  1511. tmp7 += 16; /* round and scale the single-pass vertical sum (gain 32) */
  1512. tmp7 = clp[tmp7 >> 5];
  1513. tmp5 += 512;
  1514. tmp6++; /* +1 so the >>1 below rounds the average upwards */
  1515. *mb++ = (u8)((tmp6 + tmp7) >> 1);
  1516. /* Second pixel */
  1517. tmp7 = tmp2 + tmp3;
  1518. tmp5 += (tmp7 << 4);
  1519. tmp5 += (tmp7 << 2);
  1520. tmp7 = tmp1 + tmp4;
  1521. tmp6 = *ptrJ++;
  1522. tmp5 -= (tmp7 << 2);
  1523. tmp5 -= tmp7;
  1524. tmp7 = *ptrInt++;
  1525. tmp5 += tmp6;
  1526. tmp5 = clp[tmp5 >> 10];
  1527. tmp7 += 16;
  1528. tmp7 = clp[tmp7 >> 5];
  1529. tmp4 += 512;
  1530. tmp5++;
  1531. *mb++ = (u8)((tmp5 + tmp7) >> 1);
  1532. /* Third pixel */
  1533. tmp7 = tmp1 + tmp2;
  1534. tmp4 += (tmp7 << 4);
  1535. tmp4 += (tmp7 << 2);
  1536. tmp7 = tmp6 + tmp3;
  1537. tmp5 = *ptrJ++;
  1538. tmp4 -= (tmp7 << 2);
  1539. tmp4 -= tmp7;
  1540. tmp7 = *ptrInt++;
  1541. tmp4 += tmp5;
  1542. tmp4 = clp[tmp4 >> 10];
  1543. tmp7 += 16;
  1544. tmp7 = clp[tmp7 >> 5];
  1545. tmp3 += 512;
  1546. tmp4++;
  1547. *mb++ = (u8)((tmp4 + tmp7) >> 1);
  1548. /* Fourth pixel */
  1549. tmp7 = tmp6 + tmp1;
  1550. tmp3 += (tmp7 << 4);
  1551. tmp3 += (tmp7 << 2);
  1552. tmp7 = tmp5 + tmp2;
  1553. tmp4 = *ptrJ++;
  1554. tmp3 -= (tmp7 << 2);
  1555. tmp3 -= tmp7;
  1556. tmp7 = *ptrInt++;
  1557. tmp3 += tmp4;
  1558. tmp3 = clp[tmp3 >> 10];
  1559. tmp7 += 16;
  1560. tmp7 = clp[tmp7 >> 5];
  1561. tmp3++;
  1562. *mb++ = (u8)((tmp3 + tmp7) >> 1);
  1563. tmp3 = tmp5; /* rotate the registers so tmp6..tmp2 again hold the five most recent sums, oldest first */
  1564. tmp5 = tmp1;
  1565. tmp7 = tmp4;
  1566. tmp4 = tmp6;
  1567. tmp6 = tmp2;
  1568. tmp2 = tmp7;
  1569. }
  1570. ptrJ += 5; /* skip the 5 extra columns to reach the next table row */
  1571. ptrInt += 5;
  1572. mb += 16 - partWidth; /* next output row, back to the first column */
  1573. }
  1574. }
  1575. /*------------------------------------------------------------------------------
  1576. Function: h264bsdPredictSamples
  1577. Functional description:
  1578. This function reconstructs a prediction for a macroblock partition.
  1579. The prediction is either copied or interpolated using the reference
  1580. frame and the motion vector. Both luminance and chrominance parts are
  1581. predicted. The prediction is stored in given macroblock array (data).
  1582. Inputs:
  1583. data pointer to macroblock array (384 bytes) for output
  1584. mv pointer to motion vector used for prediction
  1585. refPic pointer to reference picture structure
  1586. xA x-coordinate for current macroblock
  1587. yA y-coordinate for current macroblock
  1588. partX x-offset for partition in macroblock
  1589. partY y-offset for partition in macroblock
  1590. partWidth width of partition
  1591. partHeight height of partition
  1592. Outputs:
  1593. data macroblock array (16x16+8x8+8x8) where predicted
  1594. partition is stored at correct position
  1595. ------------------------------------------------------------------------------*/
  1596. void h264bsdPredictSamples(
  1597. u8 *data,
  1598. mv_t *mv,
  1599. image_t *refPic,
  1600. u32 xA,
  1601. u32 yA,
  1602. u32 partX,
  1603. u32 partY,
  1604. u32 partWidth,
  1605. u32 partHeight)
  1606. {
  1607. /* Variables */
  1608. u32 xFrac, yFrac, width, height;
  1609. i32 xInt, yInt;
  1610. u8 *lumaPartData;
  1611. /* Code */
  1612. ASSERT(data);
  1613. ASSERT(mv);
  1614. ASSERT(partWidth);
  1615. ASSERT(partHeight);
  1616. ASSERT(refPic);
  1617. ASSERT(refPic->data);
  1618. ASSERT(refPic->width);
  1619. ASSERT(refPic->height);
  1620. /* luma */
  1621. lumaPartData = data + 16*partY + partX;
  1622. xFrac = mv->hor & 0x3;
  1623. yFrac = mv->ver & 0x3;
  1624. width = 16 * refPic->width;
  1625. height = 16 * refPic->height;
  1626. xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
  1627. yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
  1628. ASSERT(lumaFracPos[xFrac][yFrac] < 16);
  1629. switch (lumaFracPos[xFrac][yFrac])
  1630. {
  1631. case 0: /* G */
  1632. h264bsdFillBlock(refPic->data, lumaPartData,
  1633. xInt,yInt,width,height,partWidth,partHeight,16);
  1634. break;
  1635. case 1: /* d */
  1636. h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
  1637. xInt, yInt-2, width, height, partWidth, partHeight, 0);
  1638. break;
  1639. case 2: /* h */
  1640. h264bsdInterpolateVerHalf(refPic->data, lumaPartData,
  1641. xInt, yInt-2, width, height, partWidth, partHeight);
  1642. break;
  1643. case 3: /* n */
  1644. h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
  1645. xInt, yInt-2, width, height, partWidth, partHeight, 1);
  1646. break;
  1647. case 4: /* a */
  1648. h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
  1649. xInt-2, yInt, width, height, partWidth, partHeight, 0);
  1650. break;
  1651. case 5: /* e */
  1652. h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
  1653. xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
  1654. break;
  1655. case 6: /* i */
  1656. h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
  1657. xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
  1658. break;
  1659. case 7: /* p */
  1660. h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
  1661. xInt-2, yInt-2, width, height, partWidth, partHeight, 2);
  1662. break;
  1663. case 8: /* b */
  1664. h264bsdInterpolateHorHalf(refPic->data, lumaPartData,
  1665. xInt-2, yInt, width, height, partWidth, partHeight);
  1666. break;
  1667. case 9: /* f */
  1668. h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
  1669. xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
  1670. break;
  1671. case 10: /* j */
  1672. h264bsdInterpolateMidHalf(refPic->data, lumaPartData,
  1673. xInt-2, yInt-2, width, height, partWidth, partHeight);
  1674. break;
  1675. case 11: /* q */
  1676. h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
  1677. xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
  1678. break;
  1679. case 12: /* c */
  1680. h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
  1681. xInt-2, yInt, width, height, partWidth, partHeight, 1);
  1682. break;
  1683. case 13: /* g */
  1684. h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
  1685. xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
  1686. break;
  1687. case 14: /* k */
  1688. h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
  1689. xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
  1690. break;
  1691. default: /* case 15, r */
  1692. h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
  1693. xInt-2, yInt-2, width, height, partWidth, partHeight, 3);
  1694. break;
  1695. }
  1696. /* chroma */
  1697. PredictChroma(
  1698. data + 16*16 + (partY>>1)*8 + (partX>>1),
  1699. xA + partX,
  1700. yA + partY,
  1701. partWidth,
  1702. partHeight,
  1703. mv,
  1704. refPic);
  1705. }
  1706. #else /* H264DEC_OMXDL */
  1707. /*------------------------------------------------------------------------------
  1708. Function: h264bsdPredictSamples
  1709. Functional description:
  1710. This function reconstructs a prediction for a macroblock partition (variant built when H264DEC_OMXDL is defined).
  1711. The luma prediction is produced by omxVCM4P10_InterpolateLuma and the two chroma predictions by
  1712. armVCM4P10_Interpolate_Chroma, using the reference frame and the motion vector. When the source area reaches outside
  1713. the reference picture it is first copied, with edge replication, into pFill by h264bsdFillBlock. The prediction is stored in given macroblock array (data).
  1714. Inputs:
  1715. data pointer to macroblock array (384 bytes) for output
  1716. mv pointer to motion vector used for prediction
  1717. refPic pointer to reference picture structure
  1718. colAndRow packed macroblock position: xA in bits 31..16, yA in bits 15..0
  1719. part packed partition geometry: partX in bits 31..24, partY in bits 23..16,
  1720. partWidth in bits 15..8, partHeight in bits 7..0
  1721. pFill caller-supplied scratch buffer receiving the overfilled reference area;
  1722. the required size is not visible in this function -- TODO confirm against callers
  1723. (xA, yA, partX, partY, partWidth and partHeight are unpacked into locals of the same names)
  1724. Outputs:
  1725. data macroblock array (16x16+8x8+8x8) where predicted
  1726. partition is stored at correct position
  1727. ------------------------------------------------------------------------------*/
  1728. /*lint -e{550} Symbol 'res' not accessed */
  1729. void h264bsdPredictSamples(
  1730. u8 *data,
  1731. mv_t *mv,
  1732. image_t *refPic,
  1733. u32 colAndRow,
  1734. u32 part,
  1735. u8 *pFill)
  1736. {
  1737. /* Variables */
  1738. u32 xFrac, yFrac;
  1739. u32 width, height;
  1740. i32 xInt, yInt, x0, y0;
  1741. u8 *partData, *ref;
  1742. OMXSize roi;
  1743. u32 fillWidth;
  1744. u32 fillHeight;
  1745. OMXResult res; /* only examined through ASSERT */
  1746. u32 xA, yA;
  1747. u32 partX, partY;
  1748. u32 partWidth, partHeight;
  1749. /* Code */
  1750. ASSERT(data);
  1751. ASSERT(mv);
  1752. ASSERT(refPic);
  1753. ASSERT(refPic->data);
  1754. ASSERT(refPic->width);
  1755. ASSERT(refPic->height);
  1756. xA = (colAndRow & 0xFFFF0000) >> 16; /* unpack the macroblock position */
  1757. yA = (colAndRow & 0x0000FFFF);
  1758. partX = (part & 0xFF000000) >> 24; /* unpack the partition geometry */
  1759. partY = (part & 0x00FF0000) >> 16;
  1760. partWidth = (part & 0x0000FF00) >> 8;
  1761. partHeight = (part & 0x000000FF);
  1762. ASSERT(partWidth);
  1763. ASSERT(partHeight);
  1764. /* luma */
  1765. partData = data + 16*partY + partX; /* luma rows of the macroblock array are 16 bytes wide */
  1766. xFrac = mv->hor & 0x3; /* quarter-sample fraction of the motion vector */
  1767. yFrac = mv->ver & 0x3;
  1768. width = 16 * refPic->width;
  1769. height = 16 * refPic->height;
  1770. xInt = (i32)xA + (i32)partX + (mv->hor >> 2); /* integer sample position addressed by the motion vector */
  1771. yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
  1772. x0 = (xFrac) ? xInt-2 : xInt; /* with a fractional part the source area starts 2 samples earlier -- presumably the extent read by the interpolation filter */
  1773. y0 = (yFrac) ? yInt-2 : yInt;
  1774. if (xFrac)
  1775. {
  1776. if (partWidth == 16)
  1777. fillWidth = 32;
  1778. else
  1779. fillWidth = 16;
  1780. }
  1781. else
  1782. fillWidth = (partWidth*2); /* NOTE(review): why the source width is 16/32 or twice the partition width is not visible here -- presumably a requirement of the OMX luma routine */
  1783. if (yFrac)
  1784. fillHeight = partHeight+5;
  1785. else
  1786. fillHeight = partHeight;
  1787. if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
  1788. (y0 < 0) || ((u32)y0+fillHeight > height))
  1789. {
  1790. h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,
  1791. fillWidth, fillHeight, fillWidth);
  1792. x0 = 0;
  1793. y0 = 0;
  1794. ref = pFill; /* interpolate from the overfilled copy, whose row stride is fillWidth */
  1795. width = fillWidth;
  1796. if (yFrac)
  1797. ref += 2*width; /* step back over the 2 extra rows taken above the block */
  1798. if (xFrac)
  1799. ref += 2; /* step back over the 2 extra columns taken left of the block */
  1800. }
  1801. else
  1802. {
  1803. /*lint --e(737) Loss of sign */
  1804. ref = refPic->data + yInt*width + xInt; /* source area is inside the picture: read it in place */
  1805. }
  1806. /* Luma interpolation */
  1807. roi.width = (i32)partWidth;
  1808. roi.height = (i32)partHeight;
  1809. res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,
  1810. (i32)xFrac, (i32)yFrac, roi);
  1811. ASSERT(res == 0);
  1812. /* Chroma */
  1813. width = 8 * refPic->width; /* chroma planes have half the luma dimensions */
  1814. height = 8 * refPic->height;
  1815. x0 = ((xA + partX) >> 1) + (mv->hor >> 3);
  1816. y0 = ((yA + partY) >> 1) + (mv->ver >> 3);
  1817. xFrac = mv->hor & 0x7; /* chroma fraction in eighth-sample units */
  1818. yFrac = mv->ver & 0x7;
  1819. ref = refPic->data + 256 * refPic->width * refPic->height; /* first chroma plane follows the luma plane (16*16 bytes per unit of width*height) */
  1820. roi.width = (i32)(partWidth >> 1);
  1821. fillWidth = ((partWidth >> 1) + 8) & ~0x7; /* source width rounded to a multiple of 8 */
  1822. roi.height = (i32)(partHeight >> 1);
  1823. fillHeight = (partHeight >> 1) + 1; /* one extra row below the block */
  1824. if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
  1825. (y0 < 0) || ((u32)y0+fillHeight > height))
  1826. {
  1827. h264bsdFillBlock(ref, pFill, x0, y0, width, height,
  1828. fillWidth, fillHeight, fillWidth);
  1829. ref += width * height; /* second chroma plane follows the first */
  1830. h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,
  1831. x0, y0, width, height, fillWidth,
  1832. fillHeight, fillWidth);
  1833. ref = pFill; /* both overfilled chroma areas now sit back to back in pFill */
  1834. x0 = 0;
  1835. y0 = 0;
  1836. width = fillWidth; /* so that the 'ref += height * width' below steps from the first copy to the second */
  1837. height = fillHeight;
  1838. }
  1839. partData = data + 16*16 + (partY>>1)*8 + (partX>>1); /* chroma area starts after the 16x16 luma samples, rows are 8 bytes wide */
  1840. /* Chroma interpolation */
  1841. /*lint --e(737) Loss of sign */
  1842. ref += y0 * width + x0;
  1843. res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
  1844. (u32)roi.width, (u32)roi.height, xFrac, yFrac);
  1845. ASSERT(res == 0);
  1846. partData += 8 * 8; /* second 8x8 chroma block of the macroblock array */
  1847. ref += height * width; /* same position in the second chroma plane (or second overfilled copy) */
  1848. res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
  1849. (u32)roi.width, (u32)roi.height, xFrac, yFrac);
  1850. ASSERT(res == 0);
  1851. }
  1852. #endif /* H264DEC_OMXDL */
  1853. /*------------------------------------------------------------------------------
  1854. Function: FillRow1
  1855. Functional description:
  1856. This function gets a row of reference pels in a 'normal' case when no
  1857. overfilling is necessary.
  1858. ------------------------------------------------------------------------------*/
  1859. static void FillRow1(
  1860. u8 *ref,
  1861. u8 *fill,
  1862. i32 left,
  1863. i32 center,
  1864. i32 right)
  1865. {
  1866. ASSERT(ref);
  1867. ASSERT(fill);
  1868. H264SwDecMemcpy(fill, ref, (u32)center);
  1869. /*lint -e(715) */
  1870. }
  1871. /*------------------------------------------------------------------------------
  1872. Function: h264bsdFillRow7
  1873. Functional description:
  1874. This function gets a row of reference pels when horizontal coordinate
  1875. is partly negative or partly greater than reference picture width
  1876. (overfilling some pels on left and/or right edge).
  1877. Inputs:
  1878. ref pointer to reference samples
  1879. left amount of pixels to overfill on left-edge
  1880. center amount of pixels to copy
  1881. right amount of pixels to overfill on right-edge
  1882. Outputs:
  1883. fill pointer where samples are stored
  1884. ------------------------------------------------------------------------------*/
  1885. #ifndef H264DEC_NEON
  1886. void h264bsdFillRow7(
  1887. u8 *ref,
  1888. u8 *fill,
  1889. i32 left,
  1890. i32 center,
  1891. i32 right)
  1892. {
  1893. u8 tmp;
  1894. ASSERT(ref);
  1895. ASSERT(fill);
  1896. if (left)
  1897. tmp = *ref;
  1898. for ( ; left; left--)
  1899. /*lint -esym(644,tmp) tmp is initialized if used */
  1900. *fill++ = tmp;
  1901. for ( ; center; center--)
  1902. *fill++ = *ref++;
  1903. if (right)
  1904. tmp = ref[-1];
  1905. for ( ; right; right--)
  1906. /*lint -esym(644,tmp) tmp is initialized if used */
  1907. *fill++ = tmp;
  1908. }
  1909. #endif
  1910. /*------------------------------------------------------------------------------
  1911. Function: h264bsdFillBlock
  1912. Functional description:
  1913. This function gets a block of reference pels. It determines whether
  1914. overfilling is needed or not and repeatedly calls an appropriate
  1915. function (by using a function pointer) that fills one row the block.
  1916. Inputs:
  1917. ref pointer to reference frame
  1918. x0 x-coordinate for block
  1919. y0 y-coordinate for block
  1920. width width of reference frame
  1921. height height of reference frame
  1922. blockWidth width of block
  1923. blockHeight height of block
  1924. fillScanLength length of a line in output array (pixels)
  1925. Outputs:
  1926. fill pointer to array where output block is written
  1927. ------------------------------------------------------------------------------*/
  1928. void h264bsdFillBlock(
  1929. u8 *ref,
  1930. u8 *fill,
  1931. i32 x0,
  1932. i32 y0,
  1933. u32 width,
  1934. u32 height,
  1935. u32 blockWidth,
  1936. u32 blockHeight,
  1937. u32 fillScanLength)
  1938. {
  1939. /* Variables */
  1940. i32 xstop, ystop;
  1941. void (*fp)(u8*, u8*, i32, i32, i32);
  1942. i32 left, x, right;
  1943. i32 top, y, bottom;
  1944. /* Code */
  1945. ASSERT(ref);
  1946. ASSERT(fill);
  1947. ASSERT(width);
  1948. ASSERT(height);
  1949. ASSERT(fill);
  1950. ASSERT(blockWidth);
  1951. ASSERT(blockHeight);
  1952. xstop = x0 + (i32)blockWidth;
  1953. ystop = y0 + (i32)blockHeight;
  1954. /* Choose correct function whether overfilling on left-edge or right-edge
  1955. * is needed or not */
  1956. if (x0 >= 0 && xstop <= (i32)width)
  1957. fp = FillRow1;
  1958. else
  1959. fp = h264bsdFillRow7;
  1960. if (ystop < 0)
  1961. y0 = -(i32)blockHeight;
  1962. if (xstop < 0)
  1963. x0 = -(i32)blockWidth;
  1964. if (y0 > (i32)height)
  1965. y0 = (i32)height;
  1966. if (x0 > (i32)width)
  1967. x0 = (i32)width;
  1968. xstop = x0 + (i32)blockWidth;
  1969. ystop = y0 + (i32)blockHeight;
  1970. if (x0 > 0)
  1971. ref += x0;
  1972. if (y0 > 0)
  1973. ref += y0 * (i32)width;
  1974. left = x0 < 0 ? -x0 : 0;
  1975. right = xstop > (i32)width ? xstop - (i32)width : 0;
  1976. x = (i32)blockWidth - left - right;
  1977. top = y0 < 0 ? -y0 : 0;
  1978. bottom = ystop > (i32)height ? ystop - (i32)height : 0;
  1979. y = (i32)blockHeight - top - bottom;
  1980. /* Top-overfilling */
  1981. for ( ; top; top-- )
  1982. {
  1983. (*fp)(ref, fill, left, x, right);
  1984. fill += fillScanLength;
  1985. }
  1986. /* Lines inside reference image */
  1987. for ( ; y; y-- )
  1988. {
  1989. (*fp)(ref, fill, left, x, right);
  1990. ref += width;
  1991. fill += fillScanLength;
  1992. }
  1993. ref -= width;
  1994. /* Bottom-overfilling */
  1995. for ( ; bottom; bottom-- )
  1996. {
  1997. (*fp)(ref, fill, left, x, right);
  1998. fill += fillScanLength;
  1999. }
  2000. }
  2001. /*lint +e701 +e702 */