PageRenderTime 335ms CodeModel.GetById 118ms app.highlight 83ms RepoModel.GetById 94ms app.codeStats 3ms

/Show/avc/pred_inter.cpp

http://github.com/mbebenita/Broadway
C++ | 2330 lines | 1974 code | 201 blank | 155 comment | 158 complexity | 5750c7959b2cde35dcd102bfcf79ae88 MD5 | raw file

Large files files are truncated, but you can click here to view the full file

   1/* ------------------------------------------------------------------
   2 * Copyright (C) 1998-2009 PacketVideo
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *      http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  13 * express or implied.
  14 * See the License for the specific language governing permissions
  15 * and limitations under the License.
  16 * -------------------------------------------------------------------
  17 */
  18#include "avcdec_lib.h"
  19
  20
  21#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
  22                 x = 0xFF & (~(x>>31));}
  23
  24/* (blkwidth << 2) + (dy << 1) + dx */
  25static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
  26{
  27    &ChromaFullMC_SIMD,
  28    &ChromaHorizontalMC_SIMD,
  29    &ChromaVerticalMC_SIMD,
  30    &ChromaDiagonalMC_SIMD,
  31    &ChromaFullMC_SIMD,
  32    &ChromaHorizontalMC2_SIMD,
  33    &ChromaVerticalMC2_SIMD,
  34    &ChromaDiagonalMC2_SIMD
  35};
  36/* Perform motion prediction and compensation with residue if exist. */
  37void InterMBPrediction(AVCCommonObj *video)
  38{
  39    AVCMacroblock *currMB = video->currMB;
  40    AVCPictureData *currPic = video->currPic;
  41    int mbPartIdx, subMbPartIdx;
  42    int ref_idx;
  43    int offset_MbPart_indx = 0;
  44    int16 *mv;
  45    uint32 x_pos, y_pos;
  46    uint8 *curL, *curCb, *curCr;
  47    uint8 *ref_l, *ref_Cb, *ref_Cr;
  48    uint8 *predBlock, *predCb, *predCr;
  49    int block_x, block_y, offset_x, offset_y, offsetP, offset;
  50    int x_position = (video->mb_x << 4);
  51    int y_position = (video->mb_y << 4);
  52    int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
  53    int picWidth = currPic->pitch;
  54    int picHeight = currPic->height;
  55    int16 *dataBlock;
  56    uint32 cbp4x4;
  57    uint32 tmp_word;
  58
  59    tmp_word = y_position * picWidth;
  60    curL = currPic->Sl + tmp_word + x_position;
  61    offset = (tmp_word >> 2) + (x_position >> 1);
  62    curCb = currPic->Scb + offset;
  63    curCr = currPic->Scr + offset;
  64
  65#ifdef USE_PRED_BLOCK
  66    predBlock = video->pred + 84;
  67    predCb = video->pred + 452;
  68    predCr = video->pred + 596;
  69#else
  70    predBlock = curL;
  71    predCb = curCb;
  72    predCr = curCr;
  73#endif
  74
  75    GetMotionVectorPredictor(video, false);
  76
  77    for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
  78    {
  79        MbHeight = currMB->SubMbPartHeight[mbPartIdx];
  80        MbWidth = currMB->SubMbPartWidth[mbPartIdx];
  81        mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
  82        mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
  83        ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
  84        offset_indx = 0;
  85
  86        ref_l = video->RefPicList0[ref_idx]->Sl;
  87        ref_Cb = video->RefPicList0[ref_idx]->Scb;
  88        ref_Cr = video->RefPicList0[ref_idx]->Scr;
  89
  90        for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
  91        {
  92            block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);  // check this
  93            block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
  94            mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
  95            offset_x = x_position + (block_x << 2);
  96            offset_y = y_position + (block_y << 2);
  97            x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
  98            y_pos = (offset_y << 2) + *mv;   /*quarter pel */
  99
 100            //offset = offset_y * currPic->width;
 101            //offsetC = (offset >> 2) + (offset_x >> 1);
 102#ifdef USE_PRED_BLOCK
 103            offsetP = (block_y * 80) + (block_x << 2);
 104            LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
 105                           /*comp_Sl + offset + offset_x,*/
 106                           predBlock + offsetP, 20, MbWidth, MbHeight);
 107#else
 108            offsetP = (block_y << 2) * picWidth + (block_x << 2);
 109            LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
 110                           /*comp_Sl + offset + offset_x,*/
 111                           predBlock + offsetP, picWidth, MbWidth, MbHeight);
 112#endif
 113
 114#ifdef USE_PRED_BLOCK
 115            offsetP = (block_y * 24) + (block_x << 1);
 116            ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 117                             /*comp_Scb +  offsetC,*/
 118                             predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
 119            ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 120                             /*comp_Scr +  offsetC,*/
 121                             predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
 122#else
 123            offsetP = (block_y * picWidth) + (block_x << 1);
 124            ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 125                             /*comp_Scb +  offsetC,*/
 126                             predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
 127            ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 128                             /*comp_Scr +  offsetC,*/
 129                             predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
 130#endif
 131
 132            offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
 133        }
 134        offset_MbPart_indx = currMB->MbPartWidth >> 4;
 135    }
 136
 137    /* used in decoder, used to be if(!encFlag)  */
 138
 139    /* transform in raster scan order */
 140    dataBlock = video->block;
 141    cbp4x4 = video->cbp4x4;
 142    /* luma */
 143    for (block_y = 4; block_y > 0; block_y--)
 144    {
 145        for (block_x = 4; block_x > 0; block_x--)
 146        {
 147#ifdef USE_PRED_BLOCK
 148            if (cbp4x4&1)
 149            {
 150                itrans(dataBlock, predBlock, predBlock, 20);
 151            }
 152#else
 153            if (cbp4x4&1)
 154            {
 155                itrans(dataBlock, curL, curL, picWidth);
 156            }
 157#endif
 158            cbp4x4 >>= 1;
 159            dataBlock += 4;
 160#ifdef USE_PRED_BLOCK
 161            predBlock += 4;
 162#else
 163            curL += 4;
 164#endif
 165        }
 166        dataBlock += 48;
 167#ifdef USE_PRED_BLOCK
 168        predBlock += 64;
 169#else
 170        curL += ((picWidth << 2) - 16);
 171#endif
 172    }
 173
 174    /* chroma */
 175    picWidth = (picWidth >> 1);
 176    for (block_y = 2; block_y > 0; block_y--)
 177    {
 178        for (block_x = 2; block_x > 0; block_x--)
 179        {
 180#ifdef USE_PRED_BLOCK
 181            if (cbp4x4&1)
 182            {
 183                ictrans(dataBlock, predCb, predCb, 12);
 184            }
 185#else
 186            if (cbp4x4&1)
 187            {
 188                ictrans(dataBlock, curCb, curCb, picWidth);
 189            }
 190#endif
 191            cbp4x4 >>= 1;
 192            dataBlock += 4;
 193#ifdef USE_PRED_BLOCK
 194            predCb += 4;
 195#else
 196            curCb += 4;
 197#endif
 198        }
 199        for (block_x = 2; block_x > 0; block_x--)
 200        {
 201#ifdef USE_PRED_BLOCK
 202            if (cbp4x4&1)
 203            {
 204                ictrans(dataBlock, predCr, predCr, 12);
 205            }
 206#else
 207            if (cbp4x4&1)
 208            {
 209                ictrans(dataBlock, curCr, curCr, picWidth);
 210            }
 211#endif
 212            cbp4x4 >>= 1;
 213            dataBlock += 4;
 214#ifdef USE_PRED_BLOCK
 215            predCr += 4;
 216#else
 217            curCr += 4;
 218#endif
 219        }
 220        dataBlock += 48;
 221#ifdef USE_PRED_BLOCK
 222        predCb += 40;
 223        predCr += 40;
 224#else
 225        curCb += ((picWidth << 2) - 8);
 226        curCr += ((picWidth << 2) - 8);
 227#endif
 228    }
 229
 230#ifdef MB_BASED_DEBLOCK
 231    SaveNeighborForIntraPred(video, offset);
 232#endif
 233
 234    return ;
 235}
 236
 237
 238/* preform the actual  motion comp here */
 239void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
 240                    int x_pos, int y_pos,
 241                    uint8 *pred, int pred_pitch,
 242                    int blkwidth, int blkheight)
 243{
 244    int dx, dy;
 245    uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
 246    int temp2[21][21]; /* for intermediate results */
 247    uint8 *ref2;
 248
 249    dx = x_pos & 3;
 250    dy = y_pos & 3;
 251    x_pos = x_pos >> 2;  /* round it to full-pel resolution */
 252    y_pos = y_pos >> 2;
 253
 254    /* perform actual motion compensation */
 255    if (dx == 0 && dy == 0)
 256    {  /* fullpel position *//* G */
 257        if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
 258        {
 259            ref += y_pos * picwidth + x_pos;
 260            FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
 261        }
 262        else
 263        {
 264            CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
 265            FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
 266        }
 267
 268    }   /* other positions */
 269    else  if (dy == 0)
 270    { /* no vertical interpolation *//* a,b,c*/
 271
 272        if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
 273        {
 274            ref += y_pos * picwidth + x_pos;
 275
 276            HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
 277        }
 278        else  /* need padding */
 279        {
 280            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
 281
 282            HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
 283        }
 284    }
 285    else if (dx == 0)
 286    { /*no horizontal interpolation *//* d,h,n */
 287
 288        if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
 289        {
 290            ref += y_pos * picwidth + x_pos;
 291
 292            VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
 293        }
 294        else  /* need padding */
 295        {
 296            CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
 297
 298            VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
 299        }
 300    }
 301    else if (dy == 2)
 302    {  /* horizontal cross *//* i, j, k */
 303
 304        if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
 305        {
 306            ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
 307
 308            VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
 309
 310            HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
 311        }
 312        else /* need padding */
 313        {
 314            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
 315
 316            VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
 317
 318            HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
 319        }
 320    }
 321    else if (dx == 2)
 322    { /* vertical cross */ /* f,q */
 323
 324        if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
 325        {
 326            ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
 327
 328            HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
 329            VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
 330        }
 331        else  /* need padding */
 332        {
 333            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
 334            HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
 335            VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
 336        }
 337    }
 338    else
 339    { /* diagonal *//* e,g,p,r */
 340
 341        if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
 342                y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
 343        {
 344            ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
 345
 346            ref += (y_pos * picwidth) + x_pos + (dx / 2);
 347
 348            DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
 349        }
 350        else  /* need padding */
 351        {
 352            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
 353
 354            ref2 = &temp[2 + (dy/2)][2];
 355
 356            ref = &temp[2][2 + (dx/2)];
 357
 358            DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
 359        }
 360    }
 361
 362    return ;
 363}
 364
 365void CreateAlign(uint8 *ref, int picwidth, int y_pos,
 366                 uint8 *out, int blkwidth, int blkheight)
 367{
 368    int i, j;
 369    int offset, out_offset;
 370    uint32 prev_pix, result, pix1, pix2, pix4;
 371
 372    out_offset = 24 - blkwidth;
 373
 374    //switch(x_pos&0x3){
 375    switch (((uint32)ref)&0x3)
 376    {
 377        case 1:
 378            ref += y_pos * picwidth;
 379            offset =  picwidth - blkwidth - 3;
 380            for (j = 0; j < blkheight; j++)
 381            {
 382                pix1 = *ref++;
 383                pix2 = *((uint16*)ref);
 384                ref += 2;
 385                result = (pix2 << 8) | pix1;
 386
 387                for (i = 3; i < blkwidth; i += 4)
 388                {
 389                    pix4 = *((uint32*)ref);
 390                    ref += 4;
 391                    prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
 392                    result |= prev_pix;
 393                    *((uint32*)out) = result;  /* write 4 bytes */
 394                    out += 4;
 395                    result = pix4 >> 8; /* for the next loop */
 396                }
 397                ref += offset;
 398                out += out_offset;
 399            }
 400            break;
 401        case 2:
 402            ref += y_pos * picwidth;
 403            offset =  picwidth - blkwidth - 2;
 404            for (j = 0; j < blkheight; j++)
 405            {
 406                result = *((uint16*)ref);
 407                ref += 2;
 408                for (i = 2; i < blkwidth; i += 4)
 409                {
 410                    pix4 = *((uint32*)ref);
 411                    ref += 4;
 412                    prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
 413                    result |= prev_pix;
 414                    *((uint32*)out) = result;  /* write 4 bytes */
 415                    out += 4;
 416                    result = pix4 >> 16; /* for the next loop */
 417                }
 418                ref += offset;
 419                out += out_offset;
 420            }
 421            break;
 422        case 3:
 423            ref += y_pos * picwidth;
 424            offset =  picwidth - blkwidth - 1;
 425            for (j = 0; j < blkheight; j++)
 426            {
 427                result = *ref++;
 428                for (i = 1; i < blkwidth; i += 4)
 429                {
 430                    pix4 = *((uint32*)ref);
 431                    ref += 4;
 432                    prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
 433                    result |= prev_pix;
 434                    *((uint32*)out) = result;  /* write 4 bytes */
 435                    out += 4;
 436                    result = pix4 >> 24; /* for the next loop */
 437                }
 438                ref += offset;
 439                out += out_offset;
 440            }
 441            break;
 442    }
 443}
 444
 445void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
 446               uint8 *out, int blkwidth, int blkheight)
 447{
 448    int x_inc0, x_mid;
 449    int y_inc, y_inc0, y_inc1, y_mid;
 450    int i, j;
 451    int offset;
 452
 453    if (x_pos < 0)
 454    {
 455        x_inc0 = 0;  /* increment for the first part */
 456        x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth);  /* stopping point */
 457        x_pos = 0;
 458    }
 459    else if (x_pos + blkwidth > picwidth)
 460    {
 461        x_inc0 = 1;  /* increasing */
 462        x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0);  /* clip negative to zero, encode fool proof! */
 463    }
 464    else    /* normal case */
 465    {
 466        x_inc0 = 1;
 467        x_mid = blkwidth; /* just one run */
 468    }
 469
 470
 471    /* boundary for y_pos, taking the result from x_pos into account */
 472    if (y_pos < 0)
 473    {
 474        y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
 475        y_inc1 = picwidth + y_inc0;
 476        y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
 477        y_pos = 0;
 478    }
 479    else  if (y_pos + blkheight > picheight)
 480    {
 481        y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
 482        y_inc0 = picwidth + y_inc1;                 /* increasing */
 483        y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
 484    }
 485    else  /* normal case */
 486    {
 487        y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
 488        y_inc0 = picwidth + y_inc1;
 489        y_mid = blkheight;
 490    }
 491
 492    /* clip y_pos and x_pos */
 493    if (y_pos > picheight - 1) y_pos = picheight - 1;
 494    if (x_pos > picwidth - 1) x_pos = picwidth - 1;
 495
 496    ref += y_pos * picwidth + x_pos;
 497
 498    y_inc = y_inc0;  /* start with top half */
 499
 500    offset = 24 - blkwidth; /* to use in offset out */
 501    blkwidth -= x_mid; /* to use in the loop limit */
 502
 503    if (x_inc0 == 0)
 504    {
 505        for (j = 0; j < blkheight; j++)
 506        {
 507            if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
 508            {
 509                y_inc = y_inc1;  /* switch to lower half */
 510            }
 511            for (i = x_mid; i > 0; i--)   /* first or third quarter */
 512            {
 513                *out++ = *ref;
 514            }
 515            for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
 516            {
 517                *out++ = *ref++;
 518            }
 519            out += offset;
 520            ref += y_inc;
 521        }
 522    }
 523    else
 524    {
 525        for (j = 0; j < blkheight; j++)
 526        {
 527            if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
 528            {
 529                y_inc = y_inc1;  /* switch to lower half */
 530            }
 531            for (i = x_mid; i > 0; i--)   /* first or third quarter */
 532            {
 533                *out++ = *ref++;
 534            }
 535            for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
 536            {
 537                *out++ = *ref;
 538            }
 539            out += offset;
 540            ref += y_inc;
 541        }
 542    }
 543
 544    return ;
 545}
 546
 547void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
 548                   int blkwidth, int blkheight, int dx)
 549{
 550    uint8 *p_ref;
 551    uint32 *p_cur;
 552    uint32 tmp;
 553    uint32 pkres;
 554    int result, curr_offset, ref_offset;
 555    int j;
 556    int32 r0, r1, r2, r3, r4, r5;
 557    int32 r13, r6;
 558
 559    p_cur = (uint32*)out; /* assume it's word aligned */
 560    curr_offset = (outpitch - blkwidth) >> 2;
 561    p_ref = in;
 562    ref_offset = inpitch - blkwidth;
 563
 564    if (dx&1)
 565    {
 566        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
 567        p_ref -= 2;
 568        r13 = 0;
 569        for (j = blkheight; j > 0; j--)
 570        {
 571            tmp = (uint32)(p_ref + blkwidth);
 572            r0 = p_ref[0];
 573            r1 = p_ref[2];
 574            r0 |= (r1 << 16);           /* 0,c,0,a */
 575            r1 = p_ref[1];
 576            r2 = p_ref[3];
 577            r1 |= (r2 << 16);           /* 0,d,0,b */
 578            while ((uint32)p_ref < tmp)
 579            {
 580                r2 = *(p_ref += 4); /* move pointer to e */
 581                r3 = p_ref[2];
 582                r2 |= (r3 << 16);           /* 0,g,0,e */
 583                r3 = p_ref[1];
 584                r4 = p_ref[3];
 585                r3 |= (r4 << 16);           /* 0,h,0,f */
 586
 587                r4 = r0 + r3;       /* c+h, a+f */
 588                r5 = r0 + r1;   /* c+d, a+b */
 589                r6 = r2 + r3;   /* g+h, e+f */
 590                r5 >>= 16;
 591                r5 |= (r6 << 16);   /* e+f, c+d */
 592                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
 593                r4 += 0x100010; /* +16, +16 */
 594                r5 = r1 + r2;       /* d+g, b+e */
 595                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
 596                r4 >>= 5;
 597                r13 |= r4;      /* check clipping */
 598
 599                r5 = p_ref[dx+2];
 600                r6 = p_ref[dx+4];
 601                r5 |= (r6 << 16);
 602                r4 += r5;
 603                r4 += 0x10001;
 604                r4 = (r4 >> 1) & 0xFF00FF;
 605
 606                r5 = p_ref[4];  /* i */
 607                r6 = (r5 << 16);
 608                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
 609                r5 += r1;       /* d+i, b+g */ /* r5 not free */
 610                r1 >>= 16;
 611                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
 612                r1 += r2;       /* f+g, d+e */
 613                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
 614                r0 >>= 16;
 615                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
 616                r0 += r3;       /* e+h, c+f */
 617                r5 += 0x100010; /* 16,16 */
 618                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
 619                r5 >>= 5;
 620                r13 |= r5;      /* check clipping */
 621
 622                r0 = p_ref[dx+3];
 623                r1 = p_ref[dx+5];
 624                r0 |= (r1 << 16);
 625                r5 += r0;
 626                r5 += 0x10001;
 627                r5 = (r5 >> 1) & 0xFF00FF;
 628
 629                r4 |= (r5 << 8);    /* pack them together */
 630                *p_cur++ = r4;
 631                r1 = r3;
 632                r0 = r2;
 633            }
 634            p_cur += curr_offset; /* move to the next line */
 635            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
 636
 637            if (r13&0xFF000700) /* need clipping */
 638            {
 639                /* move back to the beginning of the line */
 640                p_ref -= (ref_offset + blkwidth);   /* input */
 641                p_cur -= (outpitch >> 2);
 642
 643                tmp = (uint32)(p_ref + blkwidth);
 644                for (; (uint32)p_ref < tmp;)
 645                {
 646
 647                    r0 = *p_ref++;
 648                    r1 = *p_ref++;
 649                    r2 = *p_ref++;
 650                    r3 = *p_ref++;
 651                    r4 = *p_ref++;
 652                    /* first pixel */
 653                    r5 = *p_ref++;
 654                    result = (r0 + r5);
 655                    r0 = (r1 + r4);
 656                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 657                    r0 = (r2 + r3);
 658                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 659                    result = (result + 16) >> 5;
 660                    CLIP_RESULT(result)
 661                    /* 3/4 pel,  no need to clip */
 662                    result = (result + p_ref[dx] + 1);
 663                    pkres = (result >> 1) ;
 664                    /* second pixel */
 665                    r0 = *p_ref++;
 666                    result = (r1 + r0);
 667                    r1 = (r2 + r5);
 668                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 669                    r1 = (r3 + r4);
 670                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 671                    result = (result + 16) >> 5;
 672                    CLIP_RESULT(result)
 673                    /* 3/4 pel,  no need to clip */
 674                    result = (result + p_ref[dx] + 1);
 675                    result = (result >> 1);
 676                    pkres  |= (result << 8);
 677                    /* third pixel */
 678                    r1 = *p_ref++;
 679                    result = (r2 + r1);
 680                    r2 = (r3 + r0);
 681                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 682                    r2 = (r4 + r5);
 683                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 684                    result = (result + 16) >> 5;
 685                    CLIP_RESULT(result)
 686                    /* 3/4 pel,  no need to clip */
 687                    result = (result + p_ref[dx] + 1);
 688                    result = (result >> 1);
 689                    pkres  |= (result << 16);
 690                    /* fourth pixel */
 691                    r2 = *p_ref++;
 692                    result = (r3 + r2);
 693                    r3 = (r4 + r1);
 694                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 695                    r3 = (r5 + r0);
 696                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 697                    result = (result + 16) >> 5;
 698                    CLIP_RESULT(result)
 699                    /* 3/4 pel,  no need to clip */
 700                    result = (result + p_ref[dx] + 1);
 701                    result = (result >> 1);
 702                    pkres  |= (result << 24);
 703                    *p_cur++ = pkres; /* write 4 pixels */
 704                    p_ref -= 5;  /* offset back to the middle of filter */
 705                }
 706                p_cur += curr_offset;  /* move to the next line */
 707                p_ref += ref_offset;    /* move to the next line */
 708            }
 709        }
 710    }
 711    else
 712    {
 713        p_ref -= 2;
 714        r13 = 0;
 715        for (j = blkheight; j > 0; j--)
 716        {
 717            tmp = (uint32)(p_ref + blkwidth);
 718            r0 = p_ref[0];
 719            r1 = p_ref[2];
 720            r0 |= (r1 << 16);           /* 0,c,0,a */
 721            r1 = p_ref[1];
 722            r2 = p_ref[3];
 723            r1 |= (r2 << 16);           /* 0,d,0,b */
 724            while ((uint32)p_ref < tmp)
 725            {
 726                r2 = *(p_ref += 4); /* move pointer to e */
 727                r3 = p_ref[2];
 728                r2 |= (r3 << 16);           /* 0,g,0,e */
 729                r3 = p_ref[1];
 730                r4 = p_ref[3];
 731                r3 |= (r4 << 16);           /* 0,h,0,f */
 732
 733                r4 = r0 + r3;       /* c+h, a+f */
 734                r5 = r0 + r1;   /* c+d, a+b */
 735                r6 = r2 + r3;   /* g+h, e+f */
 736                r5 >>= 16;
 737                r5 |= (r6 << 16);   /* e+f, c+d */
 738                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
 739                r4 += 0x100010; /* +16, +16 */
 740                r5 = r1 + r2;       /* d+g, b+e */
 741                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
 742                r4 >>= 5;
 743                r13 |= r4;      /* check clipping */
 744                r4 &= 0xFF00FF; /* mask */
 745
 746                r5 = p_ref[4];  /* i */
 747                r6 = (r5 << 16);
 748                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
 749                r5 += r1;       /* d+i, b+g */ /* r5 not free */
 750                r1 >>= 16;
 751                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
 752                r1 += r2;       /* f+g, d+e */
 753                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
 754                r0 >>= 16;
 755                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
 756                r0 += r3;       /* e+h, c+f */
 757                r5 += 0x100010; /* 16,16 */
 758                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
 759                r5 >>= 5;
 760                r13 |= r5;      /* check clipping */
 761                r5 &= 0xFF00FF; /* mask */
 762
 763                r4 |= (r5 << 8);    /* pack them together */
 764                *p_cur++ = r4;
 765                r1 = r3;
 766                r0 = r2;
 767            }
 768            p_cur += curr_offset; /* move to the next line */
 769            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
 770
 771            if (r13&0xFF000700) /* need clipping */
 772            {
 773                /* move back to the beginning of the line */
 774                p_ref -= (ref_offset + blkwidth);   /* input */
 775                p_cur -= (outpitch >> 2);
 776
 777                tmp = (uint32)(p_ref + blkwidth);
 778                for (; (uint32)p_ref < tmp;)
 779                {
 780
 781                    r0 = *p_ref++;
 782                    r1 = *p_ref++;
 783                    r2 = *p_ref++;
 784                    r3 = *p_ref++;
 785                    r4 = *p_ref++;
 786                    /* first pixel */
 787                    r5 = *p_ref++;
 788                    result = (r0 + r5);
 789                    r0 = (r1 + r4);
 790                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 791                    r0 = (r2 + r3);
 792                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 793                    result = (result + 16) >> 5;
 794                    CLIP_RESULT(result)
 795                    pkres  = result;
 796                    /* second pixel */
 797                    r0 = *p_ref++;
 798                    result = (r1 + r0);
 799                    r1 = (r2 + r5);
 800                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 801                    r1 = (r3 + r4);
 802                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 803                    result = (result + 16) >> 5;
 804                    CLIP_RESULT(result)
 805                    pkres  |= (result << 8);
 806                    /* third pixel */
 807                    r1 = *p_ref++;
 808                    result = (r2 + r1);
 809                    r2 = (r3 + r0);
 810                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 811                    r2 = (r4 + r5);
 812                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 813                    result = (result + 16) >> 5;
 814                    CLIP_RESULT(result)
 815                    pkres  |= (result << 16);
 816                    /* fourth pixel */
 817                    r2 = *p_ref++;
 818                    result = (r3 + r2);
 819                    r3 = (r4 + r1);
 820                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 821                    r3 = (r5 + r0);
 822                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 823                    result = (result + 16) >> 5;
 824                    CLIP_RESULT(result)
 825                    pkres  |= (result << 24);
 826                    *p_cur++ = pkres;   /* write 4 pixels */
 827                    p_ref -= 5;
 828                }
 829                p_cur += curr_offset; /* move to the next line */
 830                p_ref += ref_offset;
 831            }
 832        }
 833    }
 834
 835    return ;
 836}
 837
 838void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
 839                   int blkwidth, int blkheight, int dx)
 840{
 841    int *p_ref;
 842    uint32 *p_cur;
 843    uint32 tmp, pkres;
 844    int result, result2, curr_offset, ref_offset;
 845    int j, r0, r1, r2, r3, r4, r5;
 846
 847    p_cur = (uint32*)out; /* assume it's word aligned */
 848    curr_offset = (outpitch - blkwidth) >> 2;
 849    p_ref = in;
 850    ref_offset = inpitch - blkwidth;
 851
 852    if (dx&1)
 853    {
 854        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
 855
 856        for (j = blkheight; j > 0 ; j--)
 857        {
 858            tmp = (uint32)(p_ref + blkwidth);
 859            for (; (uint32)p_ref < tmp;)
 860            {
 861
 862                r0 = p_ref[-2];
 863                r1 = p_ref[-1];
 864                r2 = *p_ref++;
 865                r3 = *p_ref++;
 866                r4 = *p_ref++;
 867                /* first pixel */
 868                r5 = *p_ref++;
 869                result = (r0 + r5);
 870                r0 = (r1 + r4);
 871                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 872                r0 = (r2 + r3);
 873                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 874                result = (result + 512) >> 10;
 875                CLIP_RESULT(result)
 876                result2 = ((p_ref[dx] + 16) >> 5);
 877                CLIP_RESULT(result2)
 878                /* 3/4 pel,  no need to clip */
 879                result = (result + result2 + 1);
 880                pkres = (result >> 1);
 881                /* second pixel */
 882                r0 = *p_ref++;
 883                result = (r1 + r0);
 884                r1 = (r2 + r5);
 885                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 886                r1 = (r3 + r4);
 887                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 888                result = (result + 512) >> 10;
 889                CLIP_RESULT(result)
 890                result2 = ((p_ref[dx] + 16) >> 5);
 891                CLIP_RESULT(result2)
 892                /* 3/4 pel,  no need to clip */
 893                result = (result + result2 + 1);
 894                result = (result >> 1);
 895                pkres  |= (result << 8);
 896                /* third pixel */
 897                r1 = *p_ref++;
 898                result = (r2 + r1);
 899                r2 = (r3 + r0);
 900                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 901                r2 = (r4 + r5);
 902                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 903                result = (result + 512) >> 10;
 904                CLIP_RESULT(result)
 905                result2 = ((p_ref[dx] + 16) >> 5);
 906                CLIP_RESULT(result2)
 907                /* 3/4 pel,  no need to clip */
 908                result = (result + result2 + 1);
 909                result = (result >> 1);
 910                pkres  |= (result << 16);
 911                /* fourth pixel */
 912                r2 = *p_ref++;
 913                result = (r3 + r2);
 914                r3 = (r4 + r1);
 915                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 916                r3 = (r5 + r0);
 917                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 918                result = (result + 512) >> 10;
 919                CLIP_RESULT(result)
 920                result2 = ((p_ref[dx] + 16) >> 5);
 921                CLIP_RESULT(result2)
 922                /* 3/4 pel,  no need to clip */
 923                result = (result + result2 + 1);
 924                result = (result >> 1);
 925                pkres  |= (result << 24);
 926                *p_cur++ = pkres; /* write 4 pixels */
 927                p_ref -= 3;  /* offset back to the middle of filter */
 928            }
 929            p_cur += curr_offset;  /* move to the next line */
 930            p_ref += ref_offset;    /* move to the next line */
 931        }
 932    }
 933    else
 934    {
 935        for (j = blkheight; j > 0 ; j--)
 936        {
 937            tmp = (uint32)(p_ref + blkwidth);
 938            for (; (uint32)p_ref < tmp;)
 939            {
 940
 941                r0 = p_ref[-2];
 942                r1 = p_ref[-1];
 943                r2 = *p_ref++;
 944                r3 = *p_ref++;
 945                r4 = *p_ref++;
 946                /* first pixel */
 947                r5 = *p_ref++;
 948                result = (r0 + r5);
 949                r0 = (r1 + r4);
 950                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 951                r0 = (r2 + r3);
 952                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 953                result = (result + 512) >> 10;
 954                CLIP_RESULT(result)
 955                pkres  = result;
 956                /* second pixel */
 957                r0 = *p_ref++;
 958                result = (r1 + r0);
 959                r1 = (r2 + r5);
 960                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 961                r1 = (r3 + r4);
 962                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 963                result = (result + 512) >> 10;
 964                CLIP_RESULT(result)
 965                pkres  |= (result << 8);
 966                /* third pixel */
 967                r1 = *p_ref++;
 968                result = (r2 + r1);
 969                r2 = (r3 + r0);
 970                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 971                r2 = (r4 + r5);
 972                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 973                result = (result + 512) >> 10;
 974                CLIP_RESULT(result)
 975                pkres  |= (result << 16);
 976                /* fourth pixel */
 977                r2 = *p_ref++;
 978                result = (r3 + r2);
 979                r3 = (r4 + r1);
 980                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 981                r3 = (r5 + r0);
 982                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 983                result = (result + 512) >> 10;
 984                CLIP_RESULT(result)
 985                pkres  |= (result << 24);
 986                *p_cur++ = pkres; /* write 4 pixels */
 987                p_ref -= 3;  /* offset back to the middle of filter */
 988            }
 989            p_cur += curr_offset;  /* move to the next line */
 990            p_ref += ref_offset;    /* move to the next line */
 991        }
 992    }
 993
 994    return ;
 995}
 996
 997void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
 998                   int blkwidth, int blkheight)
 999{
1000    uint8 *p_ref;
1001    int   *p_cur;
1002    uint32 tmp;
1003    int result, curr_offset, ref_offset;
1004    int j, r0, r1, r2, r3, r4, r5;
1005
1006    p_cur = out;
1007    curr_offset = (outpitch - blkwidth);
1008    p_ref = in;
1009    ref_offset = inpitch - blkwidth;
1010
1011    for (j = blkheight; j > 0 ; j--)
1012    {
1013        tmp = (uint32)(p_ref + blkwidth);
1014        for (; (uint32)p_ref < tmp;)
1015        {
1016
1017            r0 = p_ref[-2];
1018            r1 = p_ref[-1];
1019            r2 = *p_ref++;
1020            r3 = *p_ref++;
1021            r4 = *p_ref++;
1022            /* first pixel */
1023            r5 = *p_ref++;
1024            result = (r0 + r5);
1025            r0 = (r1 + r4);
1026            result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1027            r0 = (r2 + r3);
1028            result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1029            *p_cur++ = result;
1030            /* second pixel */
1031            r0 = *p_ref++;
1032            result = (r1 + r0);
1033            r1 = (r2 + r5);
1034            result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1035            r1 = (r3 + r4);
1036            result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1037            *p_cur++ = result;
1038            /* third pixel */
1039            r1 = *p_ref++;
1040            result = (r2 + r1);
1041            r2 = (r3 + r0);
1042            result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1043            r2 = (r4 + r5);
1044            result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1045            *p_cur++ = result;
1046            /* fourth pixel */
1047            r2 = *p_ref++;
1048            result = (r3 + r2);
1049            r3 = (r4 + r1);
1050            result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1051            r3 = (r5 + r0);
1052            result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1053            *p_cur++ = result;
1054            p_ref -= 3; /* move back to the middle of the filter */
1055        }
1056        p_cur += curr_offset; /* move to the next line */
1057        p_ref += ref_offset;
1058    }
1059
1060    return ;
1061}
1062void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1063                   int blkwidth, int blkheight, int dy)
1064{
1065    uint8 *p_cur, *p_ref;
1066    uint32 tmp;
1067    int result, curr_offset, ref_offset;
1068    int j, i;
1069    int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
1070    uint8  tmp_in[24][24];
1071
1072    /* not word-aligned */
1073    if (((uint32)in)&0x3)
1074    {
1075        CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1076        in = &tmp_in[2][0];
1077        inpitch = 24;
1078    }
1079    p_cur = out;
1080    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1081    ref_offset = blkheight * inpitch; /* for limit */
1082
1083    curr_offset += 3;
1084
1085    if (dy&1)
1086    {
1087        dy = (dy >> 1) ? 0 : -inpitch;
1088
1089        for (j = 0; j < blkwidth; j += 4, in += 4)
1090        {
1091            r13 = 0;
1092            p_ref = in;
1093            p_cur -= outpitch;  /* compensate for the first offset */
1094            tmp = (uint32)(p_ref + ref_offset); /* limit */
1095            while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1096            {
1097                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1098                p_ref += inpitch;
1099                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1100                r0 &= 0xFF00FF;
1101
1102                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1103                r7 = (r1 >> 8) & 0xFF00FF;
1104                r1 &= 0xFF00FF;
1105
1106                r0 += r1;
1107                r6 += r7;
1108
1109                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1110                r8 = (r2 >> 8) & 0xFF00FF;
1111                r2 &= 0xFF00FF;
1112
1113                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1114                r7 = (r1 >> 8) & 0xFF00FF;
1115                r1 &= 0xFF00FF;
1116                r1 += r2;
1117
1118                r7 += r8;
1119
1120                r0 += 20 * r1;
1121                r6 += 20 * r7;
1122                r0 += 0x100010;
1123                r6 += 0x100010;
1124
1125                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1126                r8 = (r2 >> 8) & 0xFF00FF;
1127                r2 &= 0xFF00FF;
1128
1129                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1130                r7 = (r1 >> 8) & 0xFF00FF;
1131                r1 &= 0xFF00FF;
1132                r1 += r2;
1133
1134                r7 += r8;
1135
1136                r0 -= 5 * r1;
1137                r6 -= 5 * r7;
1138
1139                r0 >>= 5;
1140                r6 >>= 5;
1141                /* clip */
1142                r13 |= r6;
1143                r13 |= r0;
1144                //CLIPPACK(r6,result)
1145
1146                r1 = *((uint32*)(p_ref + dy));
1147                r2 = (r1 >> 8) & 0xFF00FF;
1148                r1 &= 0xFF00FF;
1149                r0 += r1;
1150                r6 += r2;
1151                r0 += 0x10001;
1152                r6 += 0x10001;
1153                r0 = (r0 >> 1) & 0xFF00FF;
1154                r6 = (r6 >> 1) & 0xFF00FF;
1155
1156                r0 |= (r6 << 8);  /* pack it back */
1157                *((uint32*)(p_cur += outpitch)) = r0;
1158            }
1159            p_cur += curr_offset; /* offset to the next pixel */
1160            if (r13 & 0xFF000700) /* this column need clipping */
1161            {
1162                p_cur -= 4;
1163                for (i = 0; i < 4; i++)
1164                {
1165                    p_ref = in + i;
1166                    p_cur -= outpitch;  /* compensate for the first offset */
1167
1168                    tmp = (uint32)(p_ref + ref_offset); /* limit */
1169                    while ((uint32)p_ref < tmp)
1170                    {                           /* loop un-rolled */
1171                        r0 = *(p_ref - (inpitch << 1));
1172                        r1 = *(p_ref - inpitch);
1173                        r2 = *p_ref;
1174                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1175                        r4 = *(p_ref += inpitch);
1176                        /* first pixel */
1177                        r5 = *(p_ref += inpitch);
1178                        result = (r0 + r5);
1179                        r0 = (r1 + r4);
1180                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1181                        r0 = (r2 + r3);
1182                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1183                        result = (result + 16) >> 5;
1184                        CLIP_RESULT(result)
1185                        /* 3/4 pel,  no need to clip */
1186                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1187                        result = (result >> 1);
1188                        *(p_cur += outpitch) = result;
1189                        /* second pixel */
1190                        r0 = *(p_ref += inpitch);
1191                        result = (r1 + r0);
1192                        r1 = (r2 + r5);
1193                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1194                        r1 = (r3 + r4);
1195                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1196                        result = (result + 16) >> 5;
1197                        CLIP_RESULT(result)
1198                        /* 3/4 pel,  no need to clip */
1199                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1200                        result = (result >> 1);
1201                        *(p_cur += outpitch) = result;
1202                        /* third pixel */
1203                        r1 = *(p_ref += inpitch);
1204                        result = (r2 + r1);
1205                        r2 = (r3 + r0);
1206                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1207                        r2 = (r4 + r5);
1208                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1209                        result = (result + 16) >> 5;
1210                        CLIP_RESULT(result)
1211                        /* 3/4 pel,  no need to clip */
1212                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1213                        result = (result >> 1);
1214                        *(p_cur += outpitch) = result;
1215                        /* fourth pixel */
1216                        r2 = *(p_ref += inpitch);
1217                        result = (r3 + r2);
1218                        r3 = (r4 + r1);
1219                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1220                        r3 = (r5 + r0);
1221                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1222                        result = (result + 16) >> 5;
1223                        CLIP_RESULT(result)
1224                        /* 3/4 pel,  no need to clip */
1225                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1226                        result = (result >> 1);
1227                        *(p_cur += outpitch) = result;
1228                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1229                    }
1230                    p_cur += (curr_offset - 3);
1231                }
1232            }
1233        }
1234    }
1235    else
1236    {
1237        for (j = 0; j < blkwidth; j += 4, in += 4)
1238        {
1239            r13 = 0;
1240            p_ref = in;
1241            p_cur -= outpitch;  /* compensate for the first offset */
1242            tmp = (uint32)(p_ref + ref_offset); /* limit */
1243            while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1244            {
1245                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1246                p_ref += inpitch;
1247                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1248                r0 &= 0xFF00FF;
1249
1250                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1251                r7 = (r1 >> 8) & 0xFF00FF;
1252                r1 &= 0xFF00FF;
1253
1254                r0 += r1;
1255                r6 += r7;
1256
1257                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1258                r8 = (r2 >> 8) & 0xFF00FF;
1259                r2 &= 0xFF00FF;
1260
1261                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1262                r7 = (r1 >> 8) & 0xFF00FF;
1263                r1 &= 0xFF00FF;
1264                r1 += r2;
1265
1266                r7 += r8;
1267
1268                r0 += 20 * r1;
1269                r6 += 20 * r7;
1270                r0 += 0x100010;
1271                r6 += 0x100010;
1272
1273                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1274                r8 = (r2 >> 8) & 0xFF00FF;
1275                r2 &= 0xFF00FF;
1276
1277                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1278                r7 = (r1 >> 8) & 0xFF00FF;
1279                r1 &= 0xFF00FF;
1280                r1 += r2;
1281
1282                r7 += r8;
1283
1284                r0 -= 5 * r1;
1285                r6 -= 5 * r7;
1286
1287                r0 >>= 5;
1288                r6 >>= 5;
1289                /* clip */
1290                r13 |= r6;
1291                r13 |= r0;
1292                //CLIPPACK(r6,result)
1293                r0 &= 0xFF00FF;
1294                r6 &= 0xFF00FF;
1295                r0 |= (r6 << 8);  /* pack it back */
1296                *((uint32*)(p_cur += outpitch)) = r0;
1297            }
1298            p_cur += curr_offset; /* offset to the next pixel */
1299            if (r13 & 0xFF000700) /* this column need clipping */
1300            {
1301                p_cur -= 4;
1302                for (i = 0; i < 4; i++)
1303                {
1304                    p_ref = in + i;
1305                    p_cur -= outpitch;  /* compensate for the first offset */
1306                    tmp = (uint32)(p_ref + ref_offset); /* limit */
1307                    while ((uint32)p_ref < tmp)
1308                    {                           /* loop un-rolled */
1309                        r0 = *(p_ref - (inpitch << 1));
1310                        r1 = *(p_ref - inpitch);
1311                        r2 = *p_ref;
1312                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1313                        r4 = *(p_ref += inpitch);
1314                        /* first pixel */
1315                        r5 = *(p_ref += inpitch);
1316                        result = (r0 + r5);
1317                        r0 = (r1 + r4);
1318                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1319                        r0 = (r2 + r3);
1320                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1321                        result = (result + 16) >> 5;
1322                        CLIP_RESULT(result)
1323                        *(p_cur += outpitch) = result;
1324                        /* second pixel */
1325                        r0 = *(p_ref += inpitch);
1326                        result = (r1 + r0);
1327                        r1 = (r2 + r5);
1328                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1329                        r1 = (r3 + r4);
1330                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1331                        result = (result + 16) >> 5;
1332                        CLIP_RESULT(result)
1333                        *(p_cur += outpitch) = result;
1334                        /* third pixel */
1335                        r1 = *(p_ref += inpitch);
1336                        result = (r2 + r1);
1337                        r2 = (r3 + r0);
1338                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1339                        r2 = (r4 + r5);
1340                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1341                        result = (result + 16) >> 5;
1342                        CLIP_RESULT(result)
1343                        *(p_cur += outpitch) = result;
1344                        /* fourth pixel */
1345                        r2 = *(p_ref += inpitch);
1346                        result = (r3 + r2);
1347                        r3 = (r4 + r1);
1348                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1349                        r3 = (r5 + r0);
1350                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1351                        result = (result + 16) >> 5;
1352                        CLIP_RESULT(result)
1353                        *(p_cur += outpitch) = result;
1354                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1355                    }
1356                    p_cur += (curr_offset - 3);
1357                }
1358            }
1359        }
1360    }
1361
1362    return ;
1363}
1364
1365void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1366        

Large files files are truncated, but you can click here to view the full file