PageRenderTime 157ms CodeModel.GetById 75ms app.highlight 64ms RepoModel.GetById 1ms app.codeStats 1ms

/H264Dec/source/h264bsd_reconstruct.c

http://github.com/mbebenita/Broadway
C | 2315 lines | 1666 code | 259 blank | 390 comment | 110 complexity | a243c534b0997850d53f65736fd8ffbe MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * Copyright (C) 2009 The Android Open Source Project
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *      http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 * See the License for the specific language governing permissions and
  14 * limitations under the License.
  15 */
  16
  17/*------------------------------------------------------------------------------
  18
  19    Table of contents
  20
  21     1. Include headers
  22     2. External compiler flags
  23     3. Module defines
  24     4. Local function prototypes
  25     5. Functions
  26
  27------------------------------------------------------------------------------*/
  28
  29/*------------------------------------------------------------------------------
  30    1. Include headers
  31------------------------------------------------------------------------------*/
  32
  33#include "basetype.h"
  34#include "h264bsd_reconstruct.h"
  35#include "h264bsd_macroblock_layer.h"
  36#include "h264bsd_image.h"
  37#include "h264bsd_util.h"
  38
  39#ifdef H264DEC_OMXDL
  40#include "omxtypes.h"
  41#include "omxVC.h"
  42#include "armVC.h"
  43#endif /* H264DEC_OMXDL */
  44
  45/*------------------------------------------------------------------------------
  46    2. External compiler flags
  47--------------------------------------------------------------------------------
  48
  49--------------------------------------------------------------------------------
  50    3. Module defines
  51------------------------------------------------------------------------------*/
  52
  53/* Switch off the following Lint messages for this file:
  54 * Info 701: Shift left of signed quantity (int)
  55 * Info 702: Shift right of signed quantity (int)
  56 */
  57/*lint -e701 -e702 */
  58
  59/* Luma fractional-sample positions
  60 *
  61 *  G a b c H
  62 *  d e f g
  63 *  h i j k m
  64 *  n p q r
  65 *  M   s   N
  66 *
  67 *  G, H, M and N are integer sample positions
  68 *  a-s are fractional samples that need to be interpolated.
  69 */
  70#ifndef H264DEC_OMXDL
  71static const u32 lumaFracPos[4][4] = {
  72  /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */
  73    {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
  74#endif /* H264DEC_OMXDL */
  75
  76/* clipping table, defined in h264bsd_intra_prediction.c */
  77extern const u8 h264bsdClip[];
  78
  79/*------------------------------------------------------------------------------
  80    4. Local function prototypes
  81------------------------------------------------------------------------------*/
  82
  83#ifndef H264DEC_OMXDL
  84
  85/*------------------------------------------------------------------------------
  86
  87    Function: h264bsdInterpolateChromaHor
  88
  89        Functional description:
  90          This function performs chroma interpolation in horizontal direction.
  91          Overfilling is done only if needed. Reference image (pRef) is
  92          read at correct position and the predicted part is written to
  93          macroblock's chrominance (predPartChroma)
  94        Inputs:
  95          pRef              pointer to reference frame Cb top-left corner
  96          x0                integer x-coordinate for prediction
  97          y0                integer y-coordinate for prediction
  98          width             width of the reference frame chrominance in pixels
  99          height            height of the reference frame chrominance in pixels
 100          xFrac             horizontal fraction for prediction in 1/8 pixels
 101          chromaPartWidth   width of the predicted part in pixels
 102          chromaPartHeight  height of the predicted part in pixels
 103        Outputs:
 104          predPartChroma    pointer where predicted part is written
 105
 106------------------------------------------------------------------------------*/
 107#ifndef H264DEC_ARM11
 108void h264bsdInterpolateChromaHor(
 109  u8 *pRef,
 110  u8 *predPartChroma,
 111  i32 x0,
 112  i32 y0,
 113  u32 width,
 114  u32 height,
 115  u32 xFrac,
 116  u32 chromaPartWidth,
 117  u32 chromaPartHeight)
 118{
 119
 120/* Variables */
 121
 122    u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;
 123    u8 *ptrA, *cbr;
 124    u32 comp;
 125    u8 block[9*8*2];
 126
 127/* Code */
 128
 129    ASSERT(predPartChroma);
 130    ASSERT(chromaPartWidth);
 131    ASSERT(chromaPartHeight);
 132    ASSERT(xFrac < 8);
 133    ASSERT(pRef);
 134
 135    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
 136        (y0 < 0) || ((u32)y0+chromaPartHeight > height))
 137    {
 138        h264bsdFillBlock(pRef, block, x0, y0, width, height,
 139            chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);
 140        pRef += width * height;
 141        h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,
 142            x0, y0, width, height, chromaPartWidth + 1,
 143            chromaPartHeight, chromaPartWidth + 1);
 144
 145        pRef = block;
 146        x0 = 0;
 147        y0 = 0;
 148        width = chromaPartWidth+1;
 149        height = chromaPartHeight;
 150    }
 151
 152    val = 8 - xFrac;
 153
 154    for (comp = 0; comp <= 1; comp++)
 155    {
 156
 157        ptrA = pRef + (comp * height + (u32)y0) * width + x0;
 158        cbr = predPartChroma + comp * 8 * 8;
 159
 160        /* 2x2 pels per iteration
 161         * bilinear horizontal interpolation */
 162        for (y = (chromaPartHeight >> 1); y; y--)
 163        {
 164            for (x = (chromaPartWidth >> 1); x; x--)
 165            {
 166                tmp1 = ptrA[width];
 167                tmp2 = *ptrA++;
 168                tmp3 = ptrA[width];
 169                tmp4 = *ptrA++;
 170                c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;
 171                c >>= 6;
 172                cbr[8] = (u8)c;
 173                c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;
 174                c >>= 6;
 175                *cbr++ = (u8)c;
 176                tmp1 = ptrA[width];
 177                tmp2 = *ptrA;
 178                c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;
 179                c >>= 6;
 180                cbr[8] = (u8)c;
 181                c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;
 182                c >>= 6;
 183                *cbr++ = (u8)c;
 184            }
 185            cbr += 2*8 - chromaPartWidth;
 186            ptrA += 2*width - chromaPartWidth;
 187        }
 188    }
 189
 190}
 191
 192/*------------------------------------------------------------------------------
 193
 194    Function: h264bsdInterpolateChromaVer
 195
 196        Functional description:
 197          This function performs chroma interpolation in vertical direction.
 198          Overfilling is done only if needed. Reference image (pRef) is
 199          read at correct position and the predicted part is written to
 200          macroblock's chrominance (predPartChroma)
 201
 202------------------------------------------------------------------------------*/
 203
 204void h264bsdInterpolateChromaVer(
 205  u8 *pRef,
 206  u8 *predPartChroma,
 207  i32 x0,
 208  i32 y0,
 209  u32 width,
 210  u32 height,
 211  u32 yFrac,
 212  u32 chromaPartWidth,
 213  u32 chromaPartHeight)
 214{
 215
 216/* Variables */
 217
 218    u32 x, y, tmp1, tmp2, tmp3, c, val;
 219    u8 *ptrA, *cbr;
 220    u32 comp;
 221    u8 block[9*8*2];
 222
 223/* Code */
 224
 225    ASSERT(predPartChroma);
 226    ASSERT(chromaPartWidth);
 227    ASSERT(chromaPartHeight);
 228    ASSERT(yFrac < 8);
 229    ASSERT(pRef);
 230
 231    if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
 232        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
 233    {
 234        h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
 235            chromaPartHeight + 1, chromaPartWidth);
 236        pRef += width * height;
 237        h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
 238            x0, y0, width, height, chromaPartWidth,
 239            chromaPartHeight + 1, chromaPartWidth);
 240
 241        pRef = block;
 242        x0 = 0;
 243        y0 = 0;
 244        width = chromaPartWidth;
 245        height = chromaPartHeight+1;
 246    }
 247
 248    val = 8 - yFrac;
 249
 250    for (comp = 0; comp <= 1; comp++)
 251    {
 252
 253        ptrA = pRef + (comp * height + (u32)y0) * width + x0;
 254        cbr = predPartChroma + comp * 8 * 8;
 255
 256        /* 2x2 pels per iteration
 257         * bilinear vertical interpolation */
 258        for (y = (chromaPartHeight >> 1); y; y--)
 259        {
 260            for (x = (chromaPartWidth >> 1); x; x--)
 261            {
 262                tmp3 = ptrA[width*2];
 263                tmp2 = ptrA[width];
 264                tmp1 = *ptrA++;
 265                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
 266                c >>= 6;
 267                cbr[8] = (u8)c;
 268                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
 269                c >>= 6;
 270                *cbr++ = (u8)c;
 271                tmp3 = ptrA[width*2];
 272                tmp2 = ptrA[width];
 273                tmp1 = *ptrA++;
 274                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
 275                c >>= 6;
 276                cbr[8] = (u8)c;
 277                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
 278                c >>= 6;
 279                *cbr++ = (u8)c;
 280            }
 281            cbr += 2*8 - chromaPartWidth;
 282            ptrA += 2*width - chromaPartWidth;
 283        }
 284    }
 285
 286}
 287#endif
 288/*------------------------------------------------------------------------------
 289
 290    Function: h264bsdInterpolateChromaHorVer
 291
 292        Functional description:
 293          This function performs chroma interpolation in horizontal and
 294          vertical direction. Overfilling is done only if needed. Reference
 295          image (ref) is read at correct position and the predicted part
 296          is written to macroblock's chrominance (predPartChroma)
 297
 298------------------------------------------------------------------------------*/
 299
 300void h264bsdInterpolateChromaHorVer(
 301  u8 *ref,
 302  u8 *predPartChroma,
 303  i32 x0,
 304  i32 y0,
 305  u32 width,
 306  u32 height,
 307  u32 xFrac,
 308  u32 yFrac,
 309  u32 chromaPartWidth,
 310  u32 chromaPartHeight)
 311{
 312    u8 block[9*9*2];
 313    u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
 314    u32 comp;
 315    u8 *ptrA, *cbr;
 316
 317/* Code */
 318
 319    ASSERT(predPartChroma);
 320    ASSERT(chromaPartWidth);
 321    ASSERT(chromaPartHeight);
 322    ASSERT(xFrac < 8);
 323    ASSERT(yFrac < 8);
 324    ASSERT(ref);
 325
 326    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
 327        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
 328    {
 329        h264bsdFillBlock(ref, block, x0, y0, width, height,
 330            chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
 331        ref += width * height;
 332        h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
 333            x0, y0, width, height, chromaPartWidth + 1,
 334            chromaPartHeight + 1, chromaPartWidth + 1);
 335
 336        ref = block;
 337        x0 = 0;
 338        y0 = 0;
 339        width = chromaPartWidth+1;
 340        height = chromaPartHeight+1;
 341    }
 342
 343    valX = 8 - xFrac;
 344    valY = 8 - yFrac;
 345
 346    for (comp = 0; comp <= 1; comp++)
 347    {
 348
 349        ptrA = ref + (comp * height + (u32)y0) * width + x0;
 350        cbr = predPartChroma + comp * 8 * 8;
 351
 352        /* 2x2 pels per iteration
 353         * bilinear vertical and horizontal interpolation */
 354        for (y = (chromaPartHeight >> 1); y; y--)
 355        {
 356            tmp1 = *ptrA;
 357            tmp3 = ptrA[width];
 358            tmp5 = ptrA[width*2];
 359            tmp1 *= valY;
 360            tmp1 += tmp3 * yFrac;
 361            tmp3 *= valY;
 362            tmp3 += tmp5 * yFrac;
 363            for (x = (chromaPartWidth >> 1); x; x--)
 364            {
 365                tmp2 = *++ptrA;
 366                tmp4 = ptrA[width];
 367                tmp6 = ptrA[width*2];
 368                tmp2 *= valY;
 369                tmp2 += tmp4 * yFrac;
 370                tmp4 *= valY;
 371                tmp4 += tmp6 * yFrac;
 372                tmp1 = tmp1 * valX + plus32;
 373                tmp3 = tmp3 * valX + plus32;
 374                tmp1 += tmp2 * xFrac;
 375                tmp1 >>= 6;
 376                tmp3 += tmp4 * xFrac;
 377                tmp3 >>= 6;
 378                cbr[8] = (u8)tmp3;
 379                *cbr++ = (u8)tmp1;
 380
 381                tmp1 = *++ptrA;
 382                tmp3 = ptrA[width];
 383                tmp5 = ptrA[width*2];
 384                tmp1 *= valY;
 385                tmp1 += tmp3 * yFrac;
 386                tmp3 *= valY;
 387                tmp3 += tmp5 * yFrac;
 388                tmp2 = tmp2 * valX + plus32;
 389                tmp4 = tmp4 * valX + plus32;
 390                tmp2 += tmp1 * xFrac;
 391                tmp2 >>= 6;
 392                tmp4 += tmp3 * xFrac;
 393                tmp4 >>= 6;
 394                cbr[8] = (u8)tmp4;
 395                *cbr++ = (u8)tmp2;
 396            }
 397            cbr += 2*8 - chromaPartWidth;
 398            ptrA += 2*width - chromaPartWidth;
 399        }
 400    }
 401
 402}
 403
 404/*------------------------------------------------------------------------------
 405
 406    Function: PredictChroma
 407
 408        Functional description:
 409          Top level chroma prediction function that calls the appropriate
 410          interpolation function. The output is written to macroblock array.
 411
 412------------------------------------------------------------------------------*/
 413
 414static void PredictChroma(
 415  u8 *mbPartChroma,
 416  u32 xAL,
 417  u32 yAL,
 418  u32 partWidth,
 419  u32 partHeight,
 420  mv_t *mv,
 421  image_t *refPic)
 422{
 423
 424/* Variables */
 425
 426    u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
 427    i32 xInt, yInt;
 428    u8 *ref;
 429
 430/* Code */
 431
 432    ASSERT(mv);
 433    ASSERT(refPic);
 434    ASSERT(refPic->data);
 435    ASSERT(refPic->width);
 436    ASSERT(refPic->height);
 437
 438    width  = 8 * refPic->width;
 439    height = 8 * refPic->height;
 440
 441    xInt = (xAL >> 1) + (mv->hor >> 3);
 442    yInt = (yAL >> 1) + (mv->ver >> 3);
 443    xFrac = mv->hor & 0x7;
 444    yFrac = mv->ver & 0x7;
 445
 446    chromaPartWidth  = partWidth >> 1;
 447    chromaPartHeight = partHeight >> 1;
 448    ref = refPic->data + 256 * refPic->width * refPic->height;
 449
 450    if (xFrac && yFrac)
 451    {
 452        h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
 453                height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
 454    }
 455    else if (xFrac)
 456    {
 457        h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
 458                height, xFrac, chromaPartWidth, chromaPartHeight);
 459    }
 460    else if (yFrac)
 461    {
 462        h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
 463                height, yFrac, chromaPartWidth, chromaPartHeight);
 464    }
 465    else
 466    {
 467        h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
 468            chromaPartWidth, chromaPartHeight, 8);
 469        ref += width * height;
 470        h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
 471            chromaPartWidth, chromaPartHeight, 8);
 472    }
 473
 474}
 475
 476
 477/*------------------------------------------------------------------------------
 478
 479    Function: h264bsdInterpolateVerHalf
 480
 481        Functional description:
 482          Function to perform vertical interpolation of pixel position 'h'
 483          for a block. Overfilling is done only if needed. Reference
 484          image (ref) is read at correct position and the predicted part
 485          is written to macroblock array (mb)
 486
 487------------------------------------------------------------------------------*/
 488#ifndef H264DEC_ARM11
 489void h264bsdInterpolateVerHalf(
 490  u8 *ref,
 491  u8 *mb,
 492  i32 x0,
 493  i32 y0,
 494  u32 width,
 495  u32 height,
 496  u32 partWidth,
 497  u32 partHeight)
 498{
 499    u32 p1[21*21/4+1];
 500    u32 i, j;
 501    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 502    u8 *ptrC, *ptrV;
 503    const u8 *clp = h264bsdClip + 512;
 504
 505    /* Code */
 506
 507    ASSERT(ref);
 508    ASSERT(mb);
 509
 510    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
 511        (y0 < 0) || ((u32)y0+partHeight+5 > height))
 512    {
 513        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
 514                partWidth, partHeight+5, partWidth);
 515
 516        x0 = 0;
 517        y0 = 0;
 518        ref = (u8*)p1;
 519        width = partWidth;
 520    }
 521
 522    ref += (u32)y0 * width + (u32)x0;
 523
 524    ptrC = ref + width;
 525    ptrV = ptrC + 5*width;
 526
 527    /* 4 pixels per iteration, interpolate using 5 vertical samples */
 528    for (i = (partHeight >> 2); i; i--)
 529    {
 530        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
 531        for (j = partWidth; j; j--)
 532        {
 533            tmp4 = ptrV[-(i32)width*2];
 534            tmp5 = ptrV[-(i32)width];
 535            tmp1 = ptrV[width];
 536            tmp2 = ptrV[width*2];
 537            tmp6 = *ptrV++;
 538
 539            tmp7 = tmp4 + tmp1;
 540            tmp2 -= (tmp7 << 2);
 541            tmp2 -= tmp7;
 542            tmp2 += 16;
 543            tmp7 = tmp5 + tmp6;
 544            tmp3 = ptrC[width*2];
 545            tmp2 += (tmp7 << 4);
 546            tmp2 += (tmp7 << 2);
 547            tmp2 += tmp3;
 548            tmp2 = clp[tmp2>>5];
 549            tmp1 += 16;
 550            mb[48] = (u8)tmp2;
 551
 552            tmp7 = tmp3 + tmp6;
 553            tmp1 -= (tmp7 << 2);
 554            tmp1 -= tmp7;
 555            tmp7 = tmp4 + tmp5;
 556            tmp2 = ptrC[width];
 557            tmp1 += (tmp7 << 4);
 558            tmp1 += (tmp7 << 2);
 559            tmp1 += tmp2;
 560            tmp1 = clp[tmp1>>5];
 561            tmp6 += 16;
 562            mb[32] = (u8)tmp1;
 563
 564            tmp7 = tmp2 + tmp5;
 565            tmp6 -= (tmp7 << 2);
 566            tmp6 -= tmp7;
 567            tmp7 = tmp4 + tmp3;
 568            tmp1 = *ptrC;
 569            tmp6 += (tmp7 << 4);
 570            tmp6 += (tmp7 << 2);
 571            tmp6 += tmp1;
 572            tmp6 = clp[tmp6>>5];
 573            tmp5 += 16;
 574            mb[16] = (u8)tmp6;
 575
 576            tmp1 += tmp4;
 577            tmp5 -= (tmp1 << 2);
 578            tmp5 -= tmp1;
 579            tmp3 += tmp2;
 580            tmp6 = ptrC[-(i32)width];
 581            tmp5 += (tmp3 << 4);
 582            tmp5 += (tmp3 << 2);
 583            tmp5 += tmp6;
 584            tmp5 = clp[tmp5>>5];
 585            *mb++ = (u8)tmp5;
 586            ptrC++;
 587        }
 588        ptrC += 4*width - partWidth;
 589        ptrV += 4*width - partWidth;
 590        mb += 4*16 - partWidth;
 591    }
 592
 593}
 594
 595/*------------------------------------------------------------------------------
 596
 597    Function: h264bsdInterpolateVerQuarter
 598
 599        Functional description:
 600          Function to perform vertical interpolation of pixel position 'd'
 601          or 'n' for a block. Overfilling is done only if needed. Reference
 602          image (ref) is read at correct position and the predicted part
 603          is written to macroblock array (mb)
 604
 605------------------------------------------------------------------------------*/
 606
 607void h264bsdInterpolateVerQuarter(
 608  u8 *ref,
 609  u8 *mb,
 610  i32 x0,
 611  i32 y0,
 612  u32 width,
 613  u32 height,
 614  u32 partWidth,
 615  u32 partHeight,
 616  u32 verOffset)    /* 0 for pixel d, 1 for pixel n */
 617{
 618    u32 p1[21*21/4+1];
 619    u32 i, j;
 620    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 621    u8 *ptrC, *ptrV, *ptrInt;
 622    const u8 *clp = h264bsdClip + 512;
 623
 624    /* Code */
 625
 626    ASSERT(ref);
 627    ASSERT(mb);
 628
 629    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
 630        (y0 < 0) || ((u32)y0+partHeight+5 > height))
 631    {
 632        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
 633                partWidth, partHeight+5, partWidth);
 634
 635        x0 = 0;
 636        y0 = 0;
 637        ref = (u8*)p1;
 638        width = partWidth;
 639    }
 640
 641    ref += (u32)y0 * width + (u32)x0;
 642
 643    ptrC = ref + width;
 644    ptrV = ptrC + 5*width;
 645
 646    /* Pointer to integer sample position, either M or R */
 647    ptrInt = ptrC + (2+verOffset)*width;
 648
 649    /* 4 pixels per iteration
 650     * interpolate using 5 vertical samples and average between
 651     * interpolated value and integer sample value */
 652    for (i = (partHeight >> 2); i; i--)
 653    {
 654        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
 655        for (j = partWidth; j; j--)
 656        {
 657            tmp4 = ptrV[-(i32)width*2];
 658            tmp5 = ptrV[-(i32)width];
 659            tmp1 = ptrV[width];
 660            tmp2 = ptrV[width*2];
 661            tmp6 = *ptrV++;
 662
 663            tmp7 = tmp4 + tmp1;
 664            tmp2 -= (tmp7 << 2);
 665            tmp2 -= tmp7;
 666            tmp2 += 16;
 667            tmp7 = tmp5 + tmp6;
 668            tmp3 = ptrC[width*2];
 669            tmp2 += (tmp7 << 4);
 670            tmp2 += (tmp7 << 2);
 671            tmp2 += tmp3;
 672            tmp2 = clp[tmp2>>5];
 673            tmp7 = ptrInt[width*2];
 674            tmp1 += 16;
 675            tmp2++;
 676            mb[48] = (u8)((tmp2 + tmp7) >> 1);
 677
 678            tmp7 = tmp3 + tmp6;
 679            tmp1 -= (tmp7 << 2);
 680            tmp1 -= tmp7;
 681            tmp7 = tmp4 + tmp5;
 682            tmp2 = ptrC[width];
 683            tmp1 += (tmp7 << 4);
 684            tmp1 += (tmp7 << 2);
 685            tmp1 += tmp2;
 686            tmp1 = clp[tmp1>>5];
 687            tmp7 = ptrInt[width];
 688            tmp6 += 16;
 689            tmp1++;
 690            mb[32] = (u8)((tmp1 + tmp7) >> 1);
 691
 692            tmp7 = tmp2 + tmp5;
 693            tmp6 -= (tmp7 << 2);
 694            tmp6 -= tmp7;
 695            tmp7 = tmp4 + tmp3;
 696            tmp1 = *ptrC;
 697            tmp6 += (tmp7 << 4);
 698            tmp6 += (tmp7 << 2);
 699            tmp6 += tmp1;
 700            tmp6 = clp[tmp6>>5];
 701            tmp7 = *ptrInt;
 702            tmp5 += 16;
 703            tmp6++;
 704            mb[16] = (u8)((tmp6 + tmp7) >> 1);
 705
 706            tmp1 += tmp4;
 707            tmp5 -= (tmp1 << 2);
 708            tmp5 -= tmp1;
 709            tmp3 += tmp2;
 710            tmp6 = ptrC[-(i32)width];
 711            tmp5 += (tmp3 << 4);
 712            tmp5 += (tmp3 << 2);
 713            tmp5 += tmp6;
 714            tmp5 = clp[tmp5>>5];
 715            tmp7 = ptrInt[-(i32)width];
 716            tmp5++;
 717            *mb++ = (u8)((tmp5 + tmp7) >> 1);
 718            ptrC++;
 719            ptrInt++;
 720        }
 721        ptrC += 4*width - partWidth;
 722        ptrV += 4*width - partWidth;
 723        ptrInt += 4*width - partWidth;
 724        mb += 4*16 - partWidth;
 725    }
 726
 727}
 728
 729/*------------------------------------------------------------------------------
 730
 731    Function: h264bsdInterpolateHorHalf
 732
 733        Functional description:
 734          Function to perform horizontal interpolation of pixel position 'b'
 735          for a block. Overfilling is done only if needed. Reference
 736          image (ref) is read at correct position and the predicted part
 737          is written to macroblock array (mb)
 738
 739------------------------------------------------------------------------------*/
 740
 741void h264bsdInterpolateHorHalf(
 742  u8 *ref,
 743  u8 *mb,
 744  i32 x0,
 745  i32 y0,
 746  u32 width,
 747  u32 height,
 748  u32 partWidth,
 749  u32 partHeight)
 750{
 751    u32 p1[21*21/4+1];
 752    u8 *ptrJ;
 753    u32 x, y;
 754    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 755    const u8 *clp = h264bsdClip + 512;
 756
 757    /* Code */
 758
 759    ASSERT(ref);
 760    ASSERT(mb);
 761    ASSERT((partWidth&0x3) == 0);
 762    ASSERT((partHeight&0x3) == 0);
 763
 764    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
 765        (y0 < 0) || ((u32)y0+partHeight > height))
 766    {
 767        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
 768                partWidth+5, partHeight, partWidth+5);
 769
 770        x0 = 0;
 771        y0 = 0;
 772        ref = (u8*)p1;
 773        width = partWidth + 5;
 774    }
 775
 776    ref += (u32)y0 * width + (u32)x0;
 777
 778    ptrJ = ref + 5;
 779
 780    for (y = partHeight; y; y--)
 781    {
 782        tmp6 = *(ptrJ - 5);
 783        tmp5 = *(ptrJ - 4);
 784        tmp4 = *(ptrJ - 3);
 785        tmp3 = *(ptrJ - 2);
 786        tmp2 = *(ptrJ - 1);
 787
 788        /* calculate 4 pels per iteration */
 789        for (x = (partWidth >> 2); x; x--)
 790        {
 791            /* First pixel */
 792            tmp6 += 16;
 793            tmp7 = tmp3 + tmp4;
 794            tmp6 += (tmp7 << 4);
 795            tmp6 += (tmp7 << 2);
 796            tmp7 = tmp2 + tmp5;
 797            tmp1 = *ptrJ++;
 798            tmp6 -= (tmp7 << 2);
 799            tmp6 -= tmp7;
 800            tmp6 += tmp1;
 801            tmp6 = clp[tmp6>>5];
 802            /* Second pixel */
 803            tmp5 += 16;
 804            tmp7 = tmp2 + tmp3;
 805            *mb++ = (u8)tmp6;
 806            tmp5 += (tmp7 << 4);
 807            tmp5 += (tmp7 << 2);
 808            tmp7 = tmp1 + tmp4;
 809            tmp6 = *ptrJ++;
 810            tmp5 -= (tmp7 << 2);
 811            tmp5 -= tmp7;
 812            tmp5 += tmp6;
 813            tmp5 = clp[tmp5>>5];
 814            /* Third pixel */
 815            tmp4 += 16;
 816            tmp7 = tmp1 + tmp2;
 817            *mb++ = (u8)tmp5;
 818            tmp4 += (tmp7 << 4);
 819            tmp4 += (tmp7 << 2);
 820            tmp7 = tmp6 + tmp3;
 821            tmp5 = *ptrJ++;
 822            tmp4 -= (tmp7 << 2);
 823            tmp4 -= tmp7;
 824            tmp4 += tmp5;
 825            tmp4 = clp[tmp4>>5];
 826            /* Fourth pixel */
 827            tmp3 += 16;
 828            tmp7 = tmp6 + tmp1;
 829            *mb++ = (u8)tmp4;
 830            tmp3 += (tmp7 << 4);
 831            tmp3 += (tmp7 << 2);
 832            tmp7 = tmp5 + tmp2;
 833            tmp4 = *ptrJ++;
 834            tmp3 -= (tmp7 << 2);
 835            tmp3 -= tmp7;
 836            tmp3 += tmp4;
 837            tmp3 = clp[tmp3>>5];
 838            tmp7 = tmp4;
 839            tmp4 = tmp6;
 840            tmp6 = tmp2;
 841            tmp2 = tmp7;
 842            *mb++ = (u8)tmp3;
 843            tmp3 = tmp5;
 844            tmp5 = tmp1;
 845        }
 846        ptrJ += width - partWidth;
 847        mb += 16 - partWidth;
 848    }
 849
 850}
 851
 852/*------------------------------------------------------------------------------
 853
 854    Function: h264bsdInterpolateHorQuarter
 855
 856        Functional description:
 857          Function to perform horizontal interpolation of pixel position 'a'
 858          or 'c' for a block. Overfilling is done only if needed. Reference
 859          image (ref) is read at correct position and the predicted part
 860          is written to macroblock array (mb)
 861
 862------------------------------------------------------------------------------*/
 863
 864void h264bsdInterpolateHorQuarter(
 865  u8 *ref,
 866  u8 *mb,
 867  i32 x0,
 868  i32 y0,
 869  u32 width,
 870  u32 height,
 871  u32 partWidth,
 872  u32 partHeight,
 873  u32 horOffset) /* 0 for pixel a, 1 for pixel c */
 874{
 875    u32 p1[21*21/4+1];
 876    u8 *ptrJ;
 877    u32 x, y;
 878    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 879    const u8 *clp = h264bsdClip + 512;
 880
 881    /* Code */
 882
 883    ASSERT(ref);
 884    ASSERT(mb);
 885
 886    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
 887        (y0 < 0) || ((u32)y0+partHeight > height))
 888    {
 889        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
 890                partWidth+5, partHeight, partWidth+5);
 891
 892        x0 = 0;
 893        y0 = 0;
 894        ref = (u8*)p1;
 895        width = partWidth + 5;
 896    }
 897
 898    ref += (u32)y0 * width + (u32)x0;
 899
 900    ptrJ = ref + 5;
 901
 902    for (y = partHeight; y; y--)
 903    {
 904        tmp6 = *(ptrJ - 5);
 905        tmp5 = *(ptrJ - 4);
 906        tmp4 = *(ptrJ - 3);
 907        tmp3 = *(ptrJ - 2);
 908        tmp2 = *(ptrJ - 1);
 909
 910        /* calculate 4 pels per iteration */
 911        for (x = (partWidth >> 2); x; x--)
 912        {
 913            /* First pixel */
 914            tmp6 += 16;
 915            tmp7 = tmp3 + tmp4;
 916            tmp6 += (tmp7 << 4);
 917            tmp6 += (tmp7 << 2);
 918            tmp7 = tmp2 + tmp5;
 919            tmp1 = *ptrJ++;
 920            tmp6 -= (tmp7 << 2);
 921            tmp6 -= tmp7;
 922            tmp6 += tmp1;
 923            tmp6 = clp[tmp6>>5];
 924            tmp5 += 16;
 925            if (!horOffset)
 926                tmp6 += tmp4;
 927            else
 928                tmp6 += tmp3;
 929            *mb++ = (u8)((tmp6 + 1) >> 1);
 930            /* Second pixel */
 931            tmp7 = tmp2 + tmp3;
 932            tmp5 += (tmp7 << 4);
 933            tmp5 += (tmp7 << 2);
 934            tmp7 = tmp1 + tmp4;
 935            tmp6 = *ptrJ++;
 936            tmp5 -= (tmp7 << 2);
 937            tmp5 -= tmp7;
 938            tmp5 += tmp6;
 939            tmp5 = clp[tmp5>>5];
 940            tmp4 += 16;
 941            if (!horOffset)
 942                tmp5 += tmp3;
 943            else
 944                tmp5 += tmp2;
 945            *mb++ = (u8)((tmp5 + 1) >> 1);
 946            /* Third pixel */
 947            tmp7 = tmp1 + tmp2;
 948            tmp4 += (tmp7 << 4);
 949            tmp4 += (tmp7 << 2);
 950            tmp7 = tmp6 + tmp3;
 951            tmp5 = *ptrJ++;
 952            tmp4 -= (tmp7 << 2);
 953            tmp4 -= tmp7;
 954            tmp4 += tmp5;
 955            tmp4 = clp[tmp4>>5];
 956            tmp3 += 16;
 957            if (!horOffset)
 958                tmp4 += tmp2;
 959            else
 960                tmp4 += tmp1;
 961            *mb++ = (u8)((tmp4 + 1) >> 1);
 962            /* Fourth pixel */
 963            tmp7 = tmp6 + tmp1;
 964            tmp3 += (tmp7 << 4);
 965            tmp3 += (tmp7 << 2);
 966            tmp7 = tmp5 + tmp2;
 967            tmp4 = *ptrJ++;
 968            tmp3 -= (tmp7 << 2);
 969            tmp3 -= tmp7;
 970            tmp3 += tmp4;
 971            tmp3 = clp[tmp3>>5];
 972            if (!horOffset)
 973                tmp3 += tmp1;
 974            else
 975                tmp3 += tmp6;
 976            *mb++ = (u8)((tmp3 + 1) >> 1);
 977            tmp3 = tmp5;
 978            tmp5 = tmp1;
 979            tmp7 = tmp4;
 980            tmp4 = tmp6;
 981            tmp6 = tmp2;
 982            tmp2 = tmp7;
 983        }
 984        ptrJ += width - partWidth;
 985        mb += 16 - partWidth;
 986    }
 987
 988}
 989
 990/*------------------------------------------------------------------------------
 991
 992    Function: h264bsdInterpolateHorVerQuarter
 993
 994        Functional description:
 995          Function to perform horizontal and vertical interpolation of pixel
 996          position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
 997          if needed. Reference image (ref) is read at correct position and
 998          the predicted part is written to macroblock array (mb)
 999
1000------------------------------------------------------------------------------*/
1001
/*
 * Writes a partWidth x partHeight prediction into mb, where every output
 * sample is the rounded average of a horizontally filtered value and a
 * vertically filtered value taken from the reference picture.
 *
 *   ref           reference samples, one byte each, row stride 'width'
 *   mb            destination; consecutive output rows are 16 bytes apart
 *   x0, y0        top-left corner of the (partWidth+5) x (partHeight+5)
 *                 source window inside ref
 *   width, height picture dimensions used by the bounds test
 *   partWidth,    size of the part; the loops below step in groups of four
 *   partHeight    columns / four rows, so multiples of 4 are expected
 *   horVerOffset  bit 1 picks the window row (2 or 3) feeding the
 *                 horizontal filter, bit 0 picks the window column (2 or 3)
 *                 feeding the vertical filter
 *
 * Both filters use the taps (1, -5, 20, 20, -5, 1), add 16, shift right by
 * 5 and clip through the clp lookup table.
 */
1002void h264bsdInterpolateHorVerQuarter(
1003  u8 *ref,
1004  u8 *mb,
1005  i32 x0,
1006  i32 y0,
1007  u32 width,
1008  u32 height,
1009  u32 partWidth,
1010  u32 partHeight,
1011  u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
1012                       2 for pixel p, 3 for pixel r */
1013{
    /* Scratch window used when the source window is not fully inside the
     * picture: 21*21/4+1 u32 elements, i.e. room for 21x21 bytes if u32 is
     * four bytes wide (presumably parts of up to 16x16 plus 5 extra
     * rows/columns - TODO confirm against callers) */
1014    u32 p1[21*21/4+1];
1015    u8 *ptrC, *ptrJ, *ptrV;
1016    u32 x, y;
1017    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    /* Clipping lookup biased by 512 entries; it is indexed below with
     * shifted filter sums, which can be negative. The table itself is
     * defined outside this function. */
1018    const u8 *clp = h264bsdClip + 512;
1019
1020    /* Code */
1021
1022    ASSERT(ref);
1023    ASSERT(mb);
1024
    /* The filters need 5 samples beyond the part in each direction. If that
     * window is not completely inside the picture, h264bsdFillBlock is
     * called with p1 as destination and all further reads come from p1,
     * addressed from (0,0) with a row stride of partWidth+5. */
1025    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1026        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1027    {
1028        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1029                partWidth+5, partHeight+5, partWidth+5);
1030
1031        x0 = 0;
1032        y0 = 0;
1033        ref = (u8*)p1;
1034        width = partWidth+5;
1035    }
1036
1037    /* Ref points to G + (-2, -2) */
1038    ref += (u32)y0 * width + (u32)x0;
1039
1040    /* ptrJ points to either J or Q, depending on vertical offset */
    /* i.e. window row 2 or 3, column 5 (the sixth tap of the first output) */
1041    ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;
1042
1043    /* ptrC points to either C or D, depending on horizontal offset */
    /* i.e. window row 1, column 2 or 3 */
1044    ptrC = ref + width + 2 + (horVerOffset & 0x1);
1045
    /* Pass 1: horizontal filter, one output row per iteration. The clipped
     * result is stored straight into mb; pass 2 averages into it. */
1046    for (y = partHeight; y; y--)
1047    {
        /* Preload the five samples to the left of ptrJ; tmp1..tmp6 then hold
         * a sliding six-sample window that is rotated instead of reloaded */
1048        tmp6 = *(ptrJ - 5);
1049        tmp5 = *(ptrJ - 4);
1050        tmp4 = *(ptrJ - 3);
1051        tmp3 = *(ptrJ - 2);
1052        tmp2 = *(ptrJ - 1);
1053
1054        /* Horizontal interpolation, calculate 4 pels per iteration */
1055        for (x = (partWidth >> 2); x; x--)
1056        {
1057            /* First pixel */
            /* (t << 4) + (t << 2) is 20*t, (t << 2) + t is 5*t */
1058            tmp6 += 16;
1059            tmp7 = tmp3 + tmp4;
1060            tmp6 += (tmp7 << 4);
1061            tmp6 += (tmp7 << 2);
1062            tmp7 = tmp2 + tmp5;
1063            tmp1 = *ptrJ++;
1064            tmp6 -= (tmp7 << 2);
1065            tmp6 -= tmp7;
1066            tmp6 += tmp1;
1067            tmp6 = clp[tmp6>>5];
1068            /* Second pixel */
1069            tmp5 += 16;
1070            tmp7 = tmp2 + tmp3;
1071            *mb++ = (u8)tmp6;
1072            tmp5 += (tmp7 << 4);
1073            tmp5 += (tmp7 << 2);
1074            tmp7 = tmp1 + tmp4;
1075            tmp6 = *ptrJ++;
1076            tmp5 -= (tmp7 << 2);
1077            tmp5 -= tmp7;
1078            tmp5 += tmp6;
1079            tmp5 = clp[tmp5>>5];
1080            /* Third pixel */
1081            tmp4 += 16;
1082            tmp7 = tmp1 + tmp2;
1083            *mb++ = (u8)tmp5;
1084            tmp4 += (tmp7 << 4);
1085            tmp4 += (tmp7 << 2);
1086            tmp7 = tmp6 + tmp3;
1087            tmp5 = *ptrJ++;
1088            tmp4 -= (tmp7 << 2);
1089            tmp4 -= tmp7;
1090            tmp4 += tmp5;
1091            tmp4 = clp[tmp4>>5];
1092            /* Fourth pixel */
1093            tmp3 += 16;
1094            tmp7 = tmp6 + tmp1;
1095            *mb++ = (u8)tmp4;
1096            tmp3 += (tmp7 << 4);
1097            tmp3 += (tmp7 << 2);
1098            tmp7 = tmp5 + tmp2;
1099            tmp4 = *ptrJ++;
1100            tmp3 -= (tmp7 << 2);
1101            tmp3 -= tmp7;
1102            tmp3 += tmp4;
1103            tmp3 = clp[tmp3>>5];
            /* Rotate the temporaries so that tmp2..tmp6 again hold the five
             * most recently read samples in the order the next iteration
             * expects; the fourth result is stored in between */
1104            tmp7 = tmp4;
1105            tmp4 = tmp6;
1106            tmp6 = tmp2;
1107            tmp2 = tmp7;
1108            *mb++ = (u8)tmp3;
1109            tmp3 = tmp5;
1110            tmp5 = tmp1;
1111        }
        /* Step to the start of the next source row and the next mb row */
1112        ptrJ += width - partWidth;
1113        mb += 16 - partWidth;
1114    }
1115
    /* Rewind mb to the first output row (pass 1 moved it 16 bytes per row)
     * and place ptrV five rows below ptrC so the two pointers together
     * reach the nine source rows needed for four vertical outputs */
1116    mb -= 16*partHeight;
1117    ptrV = ptrC + 5*width;
1118
    /* Pass 2: vertical filter on the selected column, four output rows per
     * outer iteration. Each clipped result v is combined with the pass-1
     * value h already in mb as (h + v + 1) >> 1. */
1119    for (y = (partHeight >> 2); y; y--)
1120    {
1121        /* Vertical interpolation and averaging, 4 pels per iteration */
1122        for (x = partWidth; x; x--)
1123        {
1124            tmp4 = ptrV[-(i32)width*2];
1125            tmp5 = ptrV[-(i32)width];
1126            tmp1 = ptrV[width];
1127            tmp2 = ptrV[width*2];
1128            tmp6 = *ptrV++;
1129
            /* Fourth row of the group, stored at mb[48] */
1130            tmp7 = tmp4 + tmp1;
1131            tmp2 -= (tmp7 << 2);
1132            tmp2 -= tmp7;
1133            tmp2 += 16;
1134            tmp7 = tmp5 + tmp6;
1135            tmp3 = ptrC[width*2];
1136            tmp2 += (tmp7 << 4);
1137            tmp2 += (tmp7 << 2);
1138            tmp2 += tmp3;
1139            tmp7 = clp[tmp2>>5];
1140            tmp2 = mb[48];
1141            tmp1 += 16;
1142            tmp7++;
1143            mb[48] = (u8)((tmp2 + tmp7) >> 1);
1144
            /* Third row of the group, stored at mb[32] */
1145            tmp7 = tmp3 + tmp6;
1146            tmp1 -= (tmp7 << 2);
1147            tmp1 -= tmp7;
1148            tmp7 = tmp4 + tmp5;
1149            tmp2 = ptrC[width];
1150            tmp1 += (tmp7 << 4);
1151            tmp1 += (tmp7 << 2);
1152            tmp1 += tmp2;
1153            tmp7 = clp[tmp1>>5];
1154            tmp1 = mb[32];
1155            tmp6 += 16;
1156            tmp7++;
1157            mb[32] = (u8)((tmp1 + tmp7) >> 1);
1158
            /* Second row of the group, stored at mb[16] */
1159            tmp1 = *ptrC;
1160            tmp7 = tmp2 + tmp5;
1161            tmp6 -= (tmp7 << 2);
1162            tmp6 -= tmp7;
1163            tmp7 = tmp4 + tmp3;
1164            tmp6 += (tmp7 << 4);
1165            tmp6 += (tmp7 << 2);
1166            tmp6 += tmp1;
1167            tmp7 = clp[tmp6>>5];
1168            tmp6 = mb[16];
1169            tmp5 += 16;
1170            tmp7++;
1171            mb[16] = (u8)((tmp6 + tmp7) >> 1);
1172
            /* First row of the group, stored at *mb; mb and ptrC then move
             * one column to the right */
1173            tmp6 = ptrC[-(i32)width];
1174            tmp1 += tmp4;
1175            tmp5 -= (tmp1 << 2);
1176            tmp5 -= tmp1;
1177            tmp3 += tmp2;
1178            tmp5 += (tmp3 << 4);
1179            tmp5 += (tmp3 << 2);
1180            tmp5 += tmp6;
1181            tmp7 = clp[tmp5>>5];
1182            tmp5 = *mb;
1183            tmp7++;
1184            *mb++ = (u8)((tmp5 + tmp7) >> 1);
1185            ptrC++;
1186
1187        }
        /* Advance source and destination by four rows for the next group */
1188        ptrC += 4*width - partWidth;
1189        ptrV += 4*width - partWidth;
1190        mb += 4*16 - partWidth;
1191    }
1192
1193}
1194#endif
1195
1196/*------------------------------------------------------------------------------
1197
1198    Function: h264bsdInterpolateMidHalf
1199
1200        Functional description:
1201          Function to perform horizontal and vertical interpolation of pixel
1202          position 'j' for a block. Overfilling is done only if needed.
1203          Reference image (ref) is read at correct position and the predicted
1204          part is written to macroblock array (mb)
1205
1206------------------------------------------------------------------------------*/
1207
/*
 * Writes a partWidth x partHeight prediction into mb using a separable
 * two-step filter: a horizontal pass that stores unrounded sums in a local
 * table, followed by a vertical pass over those sums that rounds, scales
 * and clips.
 *
 *   ref           reference samples, one byte each, row stride 'width'
 *   mb            destination; consecutive output rows are 16 bytes apart
 *   x0, y0        top-left corner of the (partWidth+5) x (partHeight+5)
 *                 source window inside ref
 *   width, height picture dimensions used by the bounds test
 *   partWidth,    size of the part; the loops below step in groups of four
 *   partHeight    columns / four rows, so multiples of 4 are expected
 *
 * Both passes use the taps (1, -5, 20, 20, -5, 1). The final value is
 * clp[(sum + 512) >> 10].
 */
1208void h264bsdInterpolateMidHalf(
1209  u8 *ref,
1210  u8 *mb,
1211  i32 x0,
1212  i32 y0,
1213  u32 width,
1214  u32 height,
1215  u32 partWidth,
1216  u32 partHeight)
1217{
    /* Scratch window used when the source window is not fully inside the
     * picture: 21*21/4+1 u32 elements, i.e. room for 21x21 bytes if u32 is
     * four bytes wide */
1218    u32 p1[21*21/4+1];
1219    u32 x, y;
1220    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1221    i32 *ptrC, *ptrV, *b1;
1222    u8  *ptrJ;
    /* Horizontal sums for partHeight+5 rows, written contiguously with
     * partWidth entries per row; 21*16 entries are reserved */
1223    i32 table[21*16];
    /* Clipping lookup biased by 512 entries; indexed below with shifted
     * filter sums, which can be negative. The table itself is defined
     * outside this function. */
1224    const u8 *clp = h264bsdClip + 512;
1225
1226    /* Code */
1227
1228    ASSERT(ref);
1229    ASSERT(mb);
1230
    /* If the source window is not completely inside the picture,
     * h264bsdFillBlock is called with p1 as destination and all further
     * reads come from p1, addressed from (0,0) with a row stride of
     * partWidth+5 */
1231    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1232        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1233    {
1234        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1235                partWidth+5, partHeight+5, partWidth+5);
1236
1237        x0 = 0;
1238        y0 = 0;
1239        ref = (u8*)p1;
1240        width = partWidth+5;
1241    }
1242
1243    ref += (u32)y0 * width + (u32)x0;
1244
    /* b1 walks the table linearly; ptrJ starts at column 5 of window row 0,
     * i.e. on the sixth tap of the first output */
1245    b1 = table;
1246    ptrJ = ref + 5;
1247
1248    /* First step: calculate intermediate values for
1249     * horizontal interpolation */
    /* All partHeight+5 window rows are filtered because the vertical pass
     * needs five rows more than it outputs. No rounding offset, shift or
     * clipping is applied here. */
1250    for (y = partHeight + 5; y; y--)
1251    {
        /* Preload the five samples to the left of ptrJ; tmp1..tmp6 then hold
         * a sliding six-sample window that is rotated instead of reloaded */
1252        tmp6 = *(ptrJ - 5);
1253        tmp5 = *(ptrJ - 4);
1254        tmp4 = *(ptrJ - 3);
1255        tmp3 = *(ptrJ - 2);
1256        tmp2 = *(ptrJ - 1);
1257
1258        /* 4 pels per iteration */
1259        for (x = (partWidth >> 2); x; x--)
1260        {
1261            /* First pixel */
            /* (t << 4) + (t << 2) is 20*t, (t << 2) + t is 5*t */
1262            tmp7 = tmp3 + tmp4;
1263            tmp6 += (tmp7 << 4);
1264            tmp6 += (tmp7 << 2);
1265            tmp7 = tmp2 + tmp5;
1266            tmp1 = *ptrJ++;
1267            tmp6 -= (tmp7 << 2);
1268            tmp6 -= tmp7;
1269            tmp6 += tmp1;
1270            *b1++ = tmp6;
1271            /* Second pixel */
1272            tmp7 = tmp2 + tmp3;
1273            tmp5 += (tmp7 << 4);
1274            tmp5 += (tmp7 << 2);
1275            tmp7 = tmp1 + tmp4;
1276            tmp6 = *ptrJ++;
1277            tmp5 -= (tmp7 << 2);
1278            tmp5 -= tmp7;
1279            tmp5 += tmp6;
1280            *b1++ = tmp5;
1281            /* Third pixel */
1282            tmp7 = tmp1 + tmp2;
1283            tmp4 += (tmp7 << 4);
1284            tmp4 += (tmp7 << 2);
1285            tmp7 = tmp6 + tmp3;
1286            tmp5 = *ptrJ++;
1287            tmp4 -= (tmp7 << 2);
1288            tmp4 -= tmp7;
1289            tmp4 += tmp5;
1290            *b1++ = tmp4;
1291            /* Fourth pixel */
1292            tmp7 = tmp6 + tmp1;
1293            tmp3 += (tmp7 << 4);
1294            tmp3 += (tmp7 << 2);
1295            tmp7 = tmp5 + tmp2;
1296            tmp4 = *ptrJ++;
1297            tmp3 -= (tmp7 << 2);
1298            tmp3 -= tmp7;
1299            tmp3 += tmp4;
1300            *b1++ = tmp3;
            /* Rotate the temporaries so that tmp2..tmp6 again hold the five
             * most recently read samples in the order the next iteration
             * expects */
1301            tmp7 = tmp4;
1302            tmp4 = tmp6;
1303            tmp6 = tmp2;
1304            tmp2 = tmp7;
1305            tmp3 = tmp5;
1306            tmp5 = tmp1;
1307        }
        /* Step to the start of the next source row */
1308        ptrJ += width - partWidth;
1309    }
1310
1311    /* Second step: calculate vertical interpolation */
    /* ptrC starts on table row 1 and ptrV five rows below it, so together
     * they reach the nine table rows needed for four vertical outputs */
1312    ptrC = table + partWidth;
1313    ptrV = ptrC + 5*partWidth;
1314    for (y = (partHeight >> 2); y; y--)
1315    {
1316        /* 4 pels per iteration */
1317        for (x = partWidth; x; x--)
1318        {
1319            tmp4 = ptrV[-(i32)partWidth*2];
1320            tmp5 = ptrV[-(i32)partWidth];
1321            tmp1 = ptrV[partWidth];
1322            tmp2 = ptrV[partWidth*2];
1323            tmp6 = *ptrV++;
1324
            /* Fourth row of the group, stored at mb[48]; 512 is the rounding
             * offset for the shift by 10 */
1325            tmp7 = tmp4 + tmp1;
1326            tmp2 -= (tmp7 << 2);
1327            tmp2 -= tmp7;
1328            tmp2 += 512;
1329            tmp7 = tmp5 + tmp6;
1330            tmp3 = ptrC[partWidth*2];
1331            tmp2 += (tmp7 << 4);
1332            tmp2 += (tmp7 << 2);
1333            tmp2 += tmp3;
1334            tmp7 = clp[tmp2>>10];
1335            tmp1 += 512;
1336            mb[48] = (u8)tmp7;
1337
            /* Third row of the group, stored at mb[32] */
1338            tmp7 = tmp3 + tmp6;
1339            tmp1 -= (tmp7 << 2);
1340            tmp1 -= tmp7;
1341            tmp7 = tmp4 + tmp5;
1342            tmp2 = ptrC[partWidth];
1343            tmp1 += (tmp7 << 4);
1344            tmp1 += (tmp7 << 2);
1345            tmp1 += tmp2;
1346            tmp7 = clp[tmp1>>10];
1347            tmp6 += 512;
1348            mb[32] = (u8)tmp7;
1349
            /* Second row of the group, stored at mb[16] */
1350            tmp1 = *ptrC;
1351            tmp7 = tmp2 + tmp5;
1352            tmp6 -= (tmp7 << 2);
1353            tmp6 -= tmp7;
1354            tmp7 = tmp4 + tmp3;
1355            tmp6 += (tmp7 << 4);
1356            tmp6 += (tmp7 << 2);
1357            tmp6 += tmp1;
1358            tmp7 = clp[tmp6>>10];
1359            tmp5 += 512;
1360            mb[16] = (u8)tmp7;
1361
            /* First row of the group, stored at *mb; mb and ptrC then move
             * one column to the right */
1362            tmp6 = ptrC[-(i32)partWidth];
1363            tmp1 += tmp4;
1364            tmp5 -= (tmp1 << 2);
1365            tmp5 -= tmp1;
1366            tmp3 += tmp2;
1367            tmp5 += (tmp3 << 4);
1368            tmp5 += (tmp3 << 2);
1369            tmp5 += tmp6;
1370            tmp7 = clp[tmp5>>10];
1371            *mb++ = (u8)tmp7;
1372            ptrC++;
1373        }
        /* The inner loop already advanced one row's worth of columns, so
         * three more table rows complete the step of four rows */
1374        mb += 4*16 - partWidth;
1375        ptrC += 3*partWidth;
1376        ptrV += 3*partWidth;
1377    }
1378
1379}
1380
1381
1382/*------------------------------------------------------------------------------
1383
1384    Function: h264bsdInterpolateMidVerQuarter
1385
1386        Functional description:
1387          Function to perform horizontal and vertical interpolation of pixel
1388          position 'f' or 'q' for a block. Overfilling is done only if needed.
1389          Reference image (ref) is read at correct position and the predicted
1390          part is written to macroblock array (mb)
1391
1392------------------------------------------------------------------------------*/
1393
/*
 * Writes a partWidth x partHeight prediction into mb. Each output sample is
 * the rounded average of two values derived from the same table of
 * horizontal filter sums:
 *   - the two-step (horizontal then vertical) filtered value,
 *     clp[(sum + 512) >> 10], and
 *   - the horizontally filtered value of one table row,
 *     clp[(entry + 16) >> 5], taken from table row r + 2 + verOffset for
 *     output row r.
 *
 *   ref           reference samples, one byte each, row stride 'width'
 *   mb            destination; consecutive output rows are 16 bytes apart
 *   x0, y0        top-left corner of the (partWidth+5) x (partHeight+5)
 *                 source window inside ref
 *   width, height picture dimensions used by the bounds test
 *   partWidth,    size of the part; the loops below step in groups of four
 *   partHeight    columns / four rows, so multiples of 4 are expected
 *   verOffset     0 or 1, selects which table row is averaged in
 */
1394void h264bsdInterpolateMidVerQuarter(
1395  u8 *ref,
1396  u8 *mb,
1397  i32 x0,
1398  i32 y0,
1399  u32 width,
1400  u32 height,
1401  u32 partWidth,
1402  u32 partHeight,
1403  u32 verOffset)    /* 0 for pixel f, 1 for pixel q */
1404{
    /* Scratch window used when the source window is not fully inside the
     * picture: 21*21/4+1 u32 elements, i.e. room for 21x21 bytes if u32 is
     * four bytes wide */
1405    u32 p1[21*21/4+1];
1406    u32 x, y;
1407    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1408    i32 *ptrC, *ptrV, *ptrInt, *b1;
1409    u8  *ptrJ;
    /* Horizontal sums for partHeight+5 rows, written contiguously with
     * partWidth entries per row; 21*16 entries are reserved */
1410    i32 table[21*16];
    /* Clipping lookup biased by 512 entries; indexed below with shifted
     * filter sums, which can be negative. The table itself is defined
     * outside this function. */
1411    const u8 *clp = h264bsdClip + 512;
1412
1413    /* Code */
1414
1415    ASSERT(ref);
1416    ASSERT(mb);
1417
    /* If the source window is not completely inside the picture,
     * h264bsdFillBlock is called with p1 as destination and all further
     * reads come from p1, addressed from (0,0) with a row stride of
     * partWidth+5 */
1418    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1419        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1420    {
1421        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1422                partWidth+5, partHeight+5, partWidth+5);
1423
1424        x0 = 0;
1425        y0 = 0;
1426        ref = (u8*)p1;
1427        width = partWidth+5;
1428    }
1429
1430    ref += (u32)y0 * width + (u32)x0;
1431
    /* b1 walks the table linearly; ptrJ starts at column 5 of window row 0,
     * i.e. on the sixth tap of the first output */
1432    b1 = table;
1433    ptrJ = ref + 5;
1434
1435    /* First step: calculate intermediate values for
1436     * horizontal interpolation */
    /* Taps are (1, -5, 20, 20, -5, 1); no rounding offset, shift or
     * clipping is applied at this stage */
1437    for (y = partHeight + 5; y; y--)
1438    {
        /* Preload the five samples to the left of ptrJ; tmp1..tmp6 then hold
         * a sliding six-sample window that is rotated instead of reloaded */
1439        tmp6 = *(ptrJ - 5);
1440        tmp5 = *(ptrJ - 4);
1441        tmp4 = *(ptrJ - 3);
1442        tmp3 = *(ptrJ - 2);
1443        tmp2 = *(ptrJ - 1);
1444        for (x = (partWidth >> 2); x; x--)
1445        {
1446            /* First pixel */
            /* (t << 4) + (t << 2) is 20*t, (t << 2) + t is 5*t */
1447            tmp7 = tmp3 + tmp4;
1448            tmp6 += (tmp7 << 4);
1449            tmp6 += (tmp7 << 2);
1450            tmp7 = tmp2 + tmp5;
1451            tmp1 = *ptrJ++;
1452            tmp6 -= (tmp7 << 2);
1453            tmp6 -= tmp7;
1454            tmp6 += tmp1;
1455            *b1++ = tmp6;
1456            /* Second pixel */
1457            tmp7 = tmp2 + tmp3;
1458            tmp5 += (tmp7 << 4);
1459            tmp5 += (tmp7 << 2);
1460            tmp7 = tmp1 + tmp4;
1461            tmp6 = *ptrJ++;
1462            tmp5 -= (tmp7 << 2);
1463            tmp5 -= tmp7;
1464            tmp5 += tmp6;
1465            *b1++ = tmp5;
1466            /* Third pixel */
1467            tmp7 = tmp1 + tmp2;
1468            tmp4 += (tmp7 << 4);
1469            tmp4 += (tmp7 << 2);
1470            tmp7 = tmp6 + tmp3;
1471            tmp5 = *ptrJ++;
1472            tmp4 -= (tmp7 << 2);
1473            tmp4 -= tmp7;
1474            tmp4 += tmp5;
1475            *b1++ = tmp4;
1476            /* Fourth pixel */
1477            tmp7 = tmp6 + tmp1;
1478            tmp3 += (tmp7 << 4);
1479            tmp3 += (tmp7 << 2);
1480            tmp7 = tmp5 + tmp2;
1481            tmp4 = *ptrJ++;
1482            tmp3 -= (tmp7 << 2);
1483            tmp3 -= tmp7;
1484            tmp3 += tmp4;
1485            *b1++ = tmp3;
            /* Rotate the temporaries so that tmp2..tmp6 again hold the five
             * most recently read samples in the order the next iteration
             * expects */
1486            tmp7 = tmp4;
1487            tmp4 = tmp6;
1488            tmp6 = tmp2;
1489            tmp2 = tmp7;
1490            tmp3 = tmp5;
1491            tmp5 = tmp1;
1492        }
        /* Step to the start of the next source row */
1493        ptrJ += width - partWidth;
1494    }
1495
1496    /* Second step: calculate vertical interpolation and average */
    /* ptrC starts on table row 1 and ptrV five rows below it, so together
     * they reach the nine table rows needed for four vertical outputs */
1497    ptrC = table + partWidth;
1498    ptrV = ptrC + 5*partWidth;
1499    /* Pointer to integer sample position, either M or R */
    /* NOTE(review): ptrInt indexes 'table', so the values read through it
     * are horizontal filter sums (scaled back with +16, >>5 below), not raw
     * reference samples. It starts on table row 3 + verOffset. */
1500    ptrInt = ptrC + (2+verOffset)*partWidth;
1501    for (y = (partHeight >> 2); y; y--)
1502    {
1503        for (x = partWidth; x; x--)
1504        {
1505            tmp4 = ptrV[-(i32)partWidth*2];
1506            tmp5 = ptrV[-(i32)partWidth];
1507            tmp1 = ptrV[partWidth];
1508            tmp2 = ptrV[partWidth*2];
1509            tmp6 = *ptrV++;
1510
            /* Fourth row of the group, stored at mb[48]: two-step value in
             * tmp2, single-row value in tmp7, then (tmp7 + tmp2 + 1) >> 1 */
1511            tmp7 = tmp4 + tmp1;
1512            tmp2 -= (tmp7 << 2);
1513            tmp2 -= tmp7;
1514            tmp2 += 512;
1515            tmp7 = tmp5 + tmp6;
1516            tmp3 = ptrC[partWidth*2];
1517            tmp2 += (tmp7 << 4);
1518            tmp2 += (tmp7 << 2);
1519            tmp7 = ptrInt[partWidth*2];
1520            tmp2 += tmp3;
1521            tmp2 = clp[tmp2>>10];
1522            tmp7 += 16;
1523            tmp7 = clp[tmp7>>5];
1524            tmp1 += 512;
1525            tmp2++;
1526            mb[48] = (u8)((tmp7 + tmp2) >> 1);
1527
            /* Third row of the group, stored at mb[32] */
1528            tmp7 = tmp3 + tmp6;
1529            tmp1 -= (tmp7 << 2);
1530            tmp1 -= tmp7;
1531            tmp7 = tmp4 + tmp5;
1532            tmp2 = ptrC[partWidth];
1533            tmp1 += (tmp7 << 4);
1534            tmp1 += (tmp7 << 2);
1535            tmp7 = ptrInt[partWidth];
1536            tmp1 += tmp2;
1537            tmp1 = clp[tmp1>>10];
1538            tmp7 += 16;
1539            tmp7 = clp[tmp7>>5];
1540            tmp6 += 512;
1541            tmp1++;
1542            mb[32] = (u8)((tmp7 + tmp1) >> 1);
1543
            /* Second row of the group, stored at mb[16] */
1544            tmp1 = *ptrC;
1545            tmp7 = tmp2 + tmp5;
1546            tmp6 -= (tmp7 << 2);
1547            tmp6 -= tmp7;
1548            tmp7 = tmp4 + tmp3;
1549            tmp6 += (tmp7 << 4);
1550            tmp6 += (tmp7 << 2);
1551            tmp7 = *ptrInt;
1552            tmp6 += tmp1;
1553            tmp6 = clp[tmp6>>10];
1554            tmp7 += 16;
1555            tmp7 = clp[tmp7>>5];
1556            tmp5 += 512;
1557            tmp6++;
1558            mb[16] = (u8)((tmp7 + tmp6) >> 1);
1559
            /* First row of the group, stored at *mb; mb, ptrC and ptrInt
             * then move one column to the right */
1560            tmp6 = ptrC[-(i32)partWidth];
1561            tmp1 += tmp4;
1562            tmp5 -= (tmp1 << 2);
1563            tmp5 -= tmp1;
1564            tmp3 += tmp2;
1565            tmp5 += (tmp3 << 4);
1566            tmp5 += (tmp3 << 2);
1567            tmp7 = ptrInt[-(i32)partWidth];
1568            tmp5 += tmp6;
1569            tmp5 = clp[tmp5>>10];
1570            tmp7 += 16;
1571            tmp7 = clp[tmp7>>5];
1572            tmp5++;
1573            *mb++ = (u8)((tmp7 + tmp5) >> 1);
1574            ptrC++;
1575            ptrInt++;
1576        }
        /* The inner loop already advanced one row's worth of columns, so
         * three more table rows complete the step of four rows */
1577        mb += 4*16 - partWidth;
1578        ptrC += 3*partWidth;
1579        ptrV += 3*partWidth;
1580        ptrInt += 3*partWidth;
1581    }
1582
1583}
1584
1585
1586/*------------------------------------------------------------------------------
1587
1588    Function: h264bsdInterpolateMidHorQuarter
1589
1590        Functional description:
1591          Function to perform horizontal and vertical interpolation of pixel
1592          position 'i' or 'k' for a block. Overfilling is done only if needed.
1593          Reference image (ref) is read at correct position and the predicted
1594          part is written to macroblock array (mb)
1595
1596------------------------------------------------------------------------------*/
1597
1598void h264bsdInterpolateMidHorQuarter(
1599  u8 *ref,
1600  u8 *mb,
1601  i32 x0,
1602  i32 y0,
1603  u32 width,
1604  u32 height,
1605  u32 partWidth,
1606  u32 partHeight,
1607  u32 horOffset)    /* 0 for pixel i, 1 for pixel k */
1608{
1609    u32 p1[21*21/4+1];
1610    u32 x, y;
1611    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1612    i32 *ptrJ, *ptrInt, *h1;
1613    u8  *ptrC, *ptrV;
1614    i32 table[21*16];
1615    i32 tableWidth = (i32)partWidth+5;
1616    const u8 *clp = h264bsdClip + 512;
1617
1618    /* Code */
1619
1620    ASSERT(ref);
1621    ASSERT(mb);
1622
1623    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1624        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1625    {
1626        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1627                partWidth+5, partHeight+5, partWidth+5);
1628
1629        x0 = 0;
1630        y0 = 0;
1631        ref = (u8*)p1;
1632        width = partWidth+5;
1633    }
1634
1635    ref += (u32)y0 * width + (u32)x0;
1636
1637    h1 = table + tableWidth;
1638    ptrC = ref + width;
1639    ptrV = ptrC + 5*width;
1640
1641    /* First step: calculate intermediate values for
1642     * vertical interpolation */
1643    for (y = (partHeight >> 2); y; y--)
1644    {
1645        for (x = (u32)tableWidth; x; x--)
1646        {
1647            tmp4 = ptrV[-(i32)width*2];
1648            tmp5 = ptrV[-(i32)width];
1649            tmp1 = ptrV[width];
1650            tmp2 = ptrV[width*2];
1651            tmp6 = *ptrV++;
1652
1653            tmp7 = tmp4 + tmp1;
1654            tmp2 -= (tmp7 << 2);
1655            tmp2 -= tmp7;
1656            tmp7 = tmp5 + tmp6;
1657            tmp3 = ptrC[width*2];
1658            tmp2 += (tmp7 << 4);
1659            tmp2 += (tmp7 << 2);
1660            tmp2 += tmp3;
1661            h1[tableWidth*2] = tmp2;
1662
1663            tmp7 = tmp3 + tmp6;
1664            tmp1 -= (tmp7 << 2);
1665            tmp1 -= tmp7;
1666            tmp7 = tmp4 + tmp5;
1667            tmp2 = ptrC[width];
1668            tmp1 += (tmp7 << 4);
1669            tmp1 += (tmp7 << 2);
1670            tmp1 += tmp2;
1671            h1[tableWidth] = tmp1;
1672
1673            tmp1 = *ptrC;
1674            tmp7 = tmp2 + tmp5;
1675            tmp6 -= (tmp7 << 2);
1676            tmp6 -= tmp7;
1677            tmp7 = tmp4 + tmp3;
1678            tmp6 += (tmp7 << 4);
1679            tmp6 += (tmp7 << 2);
1680            tmp6 += tmp1;
1681            *h1 = tmp6;
1682
1683            tmp6 = ptrC[-(i32)width];
1684            tmp1 += tmp4;
1685            tmp5 -= (tmp1 << 2);
1686            tmp5 -= tmp1;
1687            tmp3 += tmp2;
1688            tmp5 += (tmp3 << 4);
1689            tmp5 += (tmp3 << 2);
1690            tmp5 += tmp6;
1691            h1[-tableWidth] = tmp5;
1692            h1++;
1693            ptrC++;
1694        }
1695        ptrC += 4*width - partWidth - 5;
1696        ptrV += 4*width - partWidth - 5;
1697        h1 += 3*tableWidth;
1698    }
1699
1700    /* Second step: calculate horizontal interpolation and average */
1701    ptrJ = table + 5;
1702    /* Pointer to integer sample position, either G or H */
1703    ptrInt = table + 2 + horOffset;
1704    for (y = partHeight; y; y--)
1705    {
1706        tmp6 = *(ptrJ - 5);
1707        tmp5 = *(ptrJ - 4);
1708        tmp4 = *(ptrJ - 3);
1709        tmp3 = *(ptrJ - 2);
1710        tmp2 = *(ptrJ - 1);
1711        for (x = (partWidth>>2); x; x--)
1712        {
1713            /* First pixel */
1714            tmp6 += 512;
1715            tmp7 = tmp3 + tmp4;
1716            tmp6 += (tmp7 << 4);
1717            tmp6 += (tmp7 << 2);
1718            tmp7 = tmp2 + tmp5;
1719            tmp1 = *ptrJ++;
1720            tmp6 -= (tmp7 << 2);
1721            tmp6 -= tmp7;
1722            tmp7 = *ptrInt++;
1723            tmp6 += tmp1;
1724            tmp6 = clp[tmp6 >> 10];
1725            tmp7 += 16;
1726            tmp7 = clp[tmp7 >> 5];
1727            tmp5 += 512;
1728            tmp6++;
1729            *mb++ = (u8)((tmp6 + tmp7) >> 1);
1730            /* Second pixel */
1731            tmp7 = tmp2 + tmp3;
1732            tmp5 += (tmp7 << 4);
1733            tmp5 += (tmp7 << 2);
1734            tmp7 = tmp1 + tmp4;
1735            tmp6 = *ptrJ++;
1736            tmp5 -= (tmp7 << 2);
1737            tmp5 -= tmp7;
1738            tmp7 = *ptrInt++;
1739            tmp5 += tmp6;
1740            tmp5 = clp[tmp5 >> 10];
1741            tmp7 += 16;
1742            tmp7 = clp[tmp7 >> 5];
1743            tmp4 += 512;
1744            tmp5++;
1745            *mb++ = (u8)((tmp5 + tmp7) >> 1);
1746            /* Third pixel */
1747            tmp7 = tmp1 + tmp2;
1748            tmp4 += (tmp7 << 4);
1749            tmp4 += (tmp7 << 2);
1750            tmp7 = tmp6 + tmp3;
1751            tmp5 = *ptrJ++;
1752            tmp4 -= (tmp7 << 2);
1753            tmp4 -= tmp7;
1754            tmp7 = *ptrInt++;
1755            tmp4 += tmp5;
1756            tmp4 = clp[tmp4 >> 10];
1757            tmp7 += 16;
1758            tmp7 = clp[tmp7 >> 5];
1759            tmp3 += 512;
1760            tmp4++;
1761            *mb++ = (u8)((tmp4 + tmp7) >> 1);
1762            /* Fourth pixel */
1763            tmp7 = tmp6 + tmp1;
1764            tmp3 += (tmp7 << 4);
1765            tmp3 += (tmp7 << 2);
1766            tmp7 = tmp5 + tmp2;
1767            tmp4 = *ptrJ++;
1768            tmp3 -= (tmp7 << 2);
1769            tmp3 -= tmp7;
1770            tmp7 = *ptrInt++;
1771            tmp3 += tmp4;
1772            tmp3 = clp[tmp3 >> 10];
1773            tmp7 += 16;
1774            tmp7 = clp[tmp7 >> 5];
1775            tmp3++;
1776            *mb++ = (u8)((tmp3 + tmp7) >> 1);
1777            tmp3 = tmp5;
1778            tmp5

Large files are truncated, but you can click here to view the full file