PageRenderTime 7ms CodeModel.GetById 134ms app.highlight 149ms RepoModel.GetById 2ms app.codeStats 0ms

/Show/avc/pred_inter.cpp

http://github.com/mbebenita/Broadway
C++ | 2330 lines | 1974 code | 201 blank | 155 comment | 158 complexity | 5750c7959b2cde35dcd102bfcf79ae88 MD5 | raw file
   1/* ------------------------------------------------------------------
   2 * Copyright (C) 1998-2009 PacketVideo
   3 *
   4 * Licensed under the Apache License, Version 2.0 (the "License");
   5 * you may not use this file except in compliance with the License.
   6 * You may obtain a copy of the License at
   7 *
   8 *      http://www.apache.org/licenses/LICENSE-2.0
   9 *
  10 * Unless required by applicable law or agreed to in writing, software
  11 * distributed under the License is distributed on an "AS IS" BASIS,
  12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  13 * express or implied.
  14 * See the License for the specific language governing permissions
  15 * and limitations under the License.
  16 * -------------------------------------------------------------------
  17 */
  18#include "avcdec_lib.h"
  19
  20
  21#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
  22                 x = 0xFF & (~(x>>31));}
  23
  24/* (blkwidth << 2) + (dy << 1) + dx */
  25static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
  26{
  27    &ChromaFullMC_SIMD,
  28    &ChromaHorizontalMC_SIMD,
  29    &ChromaVerticalMC_SIMD,
  30    &ChromaDiagonalMC_SIMD,
  31    &ChromaFullMC_SIMD,
  32    &ChromaHorizontalMC2_SIMD,
  33    &ChromaVerticalMC2_SIMD,
  34    &ChromaDiagonalMC2_SIMD
  35};
  36/* Perform motion prediction and compensation with residue if exist. */
  37void InterMBPrediction(AVCCommonObj *video)
  38{
  39    AVCMacroblock *currMB = video->currMB;
  40    AVCPictureData *currPic = video->currPic;
  41    int mbPartIdx, subMbPartIdx;
  42    int ref_idx;
  43    int offset_MbPart_indx = 0;
  44    int16 *mv;
  45    uint32 x_pos, y_pos;
  46    uint8 *curL, *curCb, *curCr;
  47    uint8 *ref_l, *ref_Cb, *ref_Cr;
  48    uint8 *predBlock, *predCb, *predCr;
  49    int block_x, block_y, offset_x, offset_y, offsetP, offset;
  50    int x_position = (video->mb_x << 4);
  51    int y_position = (video->mb_y << 4);
  52    int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
  53    int picWidth = currPic->pitch;
  54    int picHeight = currPic->height;
  55    int16 *dataBlock;
  56    uint32 cbp4x4;
  57    uint32 tmp_word;
  58
  59    tmp_word = y_position * picWidth;
  60    curL = currPic->Sl + tmp_word + x_position;
  61    offset = (tmp_word >> 2) + (x_position >> 1);
  62    curCb = currPic->Scb + offset;
  63    curCr = currPic->Scr + offset;
  64
  65#ifdef USE_PRED_BLOCK
  66    predBlock = video->pred + 84;
  67    predCb = video->pred + 452;
  68    predCr = video->pred + 596;
  69#else
  70    predBlock = curL;
  71    predCb = curCb;
  72    predCr = curCr;
  73#endif
  74
  75    GetMotionVectorPredictor(video, false);
  76
  77    for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
  78    {
  79        MbHeight = currMB->SubMbPartHeight[mbPartIdx];
  80        MbWidth = currMB->SubMbPartWidth[mbPartIdx];
  81        mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
  82        mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
  83        ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
  84        offset_indx = 0;
  85
  86        ref_l = video->RefPicList0[ref_idx]->Sl;
  87        ref_Cb = video->RefPicList0[ref_idx]->Scb;
  88        ref_Cr = video->RefPicList0[ref_idx]->Scr;
  89
  90        for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
  91        {
  92            block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);  // check this
  93            block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
  94            mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
  95            offset_x = x_position + (block_x << 2);
  96            offset_y = y_position + (block_y << 2);
  97            x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
  98            y_pos = (offset_y << 2) + *mv;   /*quarter pel */
  99
 100            //offset = offset_y * currPic->width;
 101            //offsetC = (offset >> 2) + (offset_x >> 1);
 102#ifdef USE_PRED_BLOCK
 103            offsetP = (block_y * 80) + (block_x << 2);
 104            LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
 105                           /*comp_Sl + offset + offset_x,*/
 106                           predBlock + offsetP, 20, MbWidth, MbHeight);
 107#else
 108            offsetP = (block_y << 2) * picWidth + (block_x << 2);
 109            LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
 110                           /*comp_Sl + offset + offset_x,*/
 111                           predBlock + offsetP, picWidth, MbWidth, MbHeight);
 112#endif
 113
 114#ifdef USE_PRED_BLOCK
 115            offsetP = (block_y * 24) + (block_x << 1);
 116            ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 117                             /*comp_Scb +  offsetC,*/
 118                             predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
 119            ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 120                             /*comp_Scr +  offsetC,*/
 121                             predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
 122#else
 123            offsetP = (block_y * picWidth) + (block_x << 1);
 124            ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 125                             /*comp_Scb +  offsetC,*/
 126                             predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
 127            ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
 128                             /*comp_Scr +  offsetC,*/
 129                             predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
 130#endif
 131
 132            offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
 133        }
 134        offset_MbPart_indx = currMB->MbPartWidth >> 4;
 135    }
 136
 137    /* used in decoder, used to be if(!encFlag)  */
 138
 139    /* transform in raster scan order */
 140    dataBlock = video->block;
 141    cbp4x4 = video->cbp4x4;
 142    /* luma */
 143    for (block_y = 4; block_y > 0; block_y--)
 144    {
 145        for (block_x = 4; block_x > 0; block_x--)
 146        {
 147#ifdef USE_PRED_BLOCK
 148            if (cbp4x4&1)
 149            {
 150                itrans(dataBlock, predBlock, predBlock, 20);
 151            }
 152#else
 153            if (cbp4x4&1)
 154            {
 155                itrans(dataBlock, curL, curL, picWidth);
 156            }
 157#endif
 158            cbp4x4 >>= 1;
 159            dataBlock += 4;
 160#ifdef USE_PRED_BLOCK
 161            predBlock += 4;
 162#else
 163            curL += 4;
 164#endif
 165        }
 166        dataBlock += 48;
 167#ifdef USE_PRED_BLOCK
 168        predBlock += 64;
 169#else
 170        curL += ((picWidth << 2) - 16);
 171#endif
 172    }
 173
 174    /* chroma */
 175    picWidth = (picWidth >> 1);
 176    for (block_y = 2; block_y > 0; block_y--)
 177    {
 178        for (block_x = 2; block_x > 0; block_x--)
 179        {
 180#ifdef USE_PRED_BLOCK
 181            if (cbp4x4&1)
 182            {
 183                ictrans(dataBlock, predCb, predCb, 12);
 184            }
 185#else
 186            if (cbp4x4&1)
 187            {
 188                ictrans(dataBlock, curCb, curCb, picWidth);
 189            }
 190#endif
 191            cbp4x4 >>= 1;
 192            dataBlock += 4;
 193#ifdef USE_PRED_BLOCK
 194            predCb += 4;
 195#else
 196            curCb += 4;
 197#endif
 198        }
 199        for (block_x = 2; block_x > 0; block_x--)
 200        {
 201#ifdef USE_PRED_BLOCK
 202            if (cbp4x4&1)
 203            {
 204                ictrans(dataBlock, predCr, predCr, 12);
 205            }
 206#else
 207            if (cbp4x4&1)
 208            {
 209                ictrans(dataBlock, curCr, curCr, picWidth);
 210            }
 211#endif
 212            cbp4x4 >>= 1;
 213            dataBlock += 4;
 214#ifdef USE_PRED_BLOCK
 215            predCr += 4;
 216#else
 217            curCr += 4;
 218#endif
 219        }
 220        dataBlock += 48;
 221#ifdef USE_PRED_BLOCK
 222        predCb += 40;
 223        predCr += 40;
 224#else
 225        curCb += ((picWidth << 2) - 8);
 226        curCr += ((picWidth << 2) - 8);
 227#endif
 228    }
 229
 230#ifdef MB_BASED_DEBLOCK
 231    SaveNeighborForIntraPred(video, offset);
 232#endif
 233
 234    return ;
 235}
 236
 237
 238/* preform the actual  motion comp here */
 239void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
 240                    int x_pos, int y_pos,
 241                    uint8 *pred, int pred_pitch,
 242                    int blkwidth, int blkheight)
 243{
 244    int dx, dy;
 245    uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
 246    int temp2[21][21]; /* for intermediate results */
 247    uint8 *ref2;
 248
 249    dx = x_pos & 3;
 250    dy = y_pos & 3;
 251    x_pos = x_pos >> 2;  /* round it to full-pel resolution */
 252    y_pos = y_pos >> 2;
 253
 254    /* perform actual motion compensation */
 255    if (dx == 0 && dy == 0)
 256    {  /* fullpel position *//* G */
 257        if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
 258        {
 259            ref += y_pos * picwidth + x_pos;
 260            FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
 261        }
 262        else
 263        {
 264            CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
 265            FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
 266        }
 267
 268    }   /* other positions */
 269    else  if (dy == 0)
 270    { /* no vertical interpolation *//* a,b,c*/
 271
 272        if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
 273        {
 274            ref += y_pos * picwidth + x_pos;
 275
 276            HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
 277        }
 278        else  /* need padding */
 279        {
 280            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
 281
 282            HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
 283        }
 284    }
 285    else if (dx == 0)
 286    { /*no horizontal interpolation *//* d,h,n */
 287
 288        if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
 289        {
 290            ref += y_pos * picwidth + x_pos;
 291
 292            VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
 293        }
 294        else  /* need padding */
 295        {
 296            CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
 297
 298            VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
 299        }
 300    }
 301    else if (dy == 2)
 302    {  /* horizontal cross *//* i, j, k */
 303
 304        if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
 305        {
 306            ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
 307
 308            VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
 309
 310            HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
 311        }
 312        else /* need padding */
 313        {
 314            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
 315
 316            VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
 317
 318            HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
 319        }
 320    }
 321    else if (dx == 2)
 322    { /* vertical cross */ /* f,q */
 323
 324        if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
 325        {
 326            ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
 327
 328            HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
 329            VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
 330        }
 331        else  /* need padding */
 332        {
 333            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
 334            HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
 335            VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
 336        }
 337    }
 338    else
 339    { /* diagonal *//* e,g,p,r */
 340
 341        if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
 342                y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
 343        {
 344            ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
 345
 346            ref += (y_pos * picwidth) + x_pos + (dx / 2);
 347
 348            DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
 349        }
 350        else  /* need padding */
 351        {
 352            CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
 353
 354            ref2 = &temp[2 + (dy/2)][2];
 355
 356            ref = &temp[2][2 + (dx/2)];
 357
 358            DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
 359        }
 360    }
 361
 362    return ;
 363}
 364
 365void CreateAlign(uint8 *ref, int picwidth, int y_pos,
 366                 uint8 *out, int blkwidth, int blkheight)
 367{
 368    int i, j;
 369    int offset, out_offset;
 370    uint32 prev_pix, result, pix1, pix2, pix4;
 371
 372    out_offset = 24 - blkwidth;
 373
 374    //switch(x_pos&0x3){
 375    switch (((uint32)ref)&0x3)
 376    {
 377        case 1:
 378            ref += y_pos * picwidth;
 379            offset =  picwidth - blkwidth - 3;
 380            for (j = 0; j < blkheight; j++)
 381            {
 382                pix1 = *ref++;
 383                pix2 = *((uint16*)ref);
 384                ref += 2;
 385                result = (pix2 << 8) | pix1;
 386
 387                for (i = 3; i < blkwidth; i += 4)
 388                {
 389                    pix4 = *((uint32*)ref);
 390                    ref += 4;
 391                    prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
 392                    result |= prev_pix;
 393                    *((uint32*)out) = result;  /* write 4 bytes */
 394                    out += 4;
 395                    result = pix4 >> 8; /* for the next loop */
 396                }
 397                ref += offset;
 398                out += out_offset;
 399            }
 400            break;
 401        case 2:
 402            ref += y_pos * picwidth;
 403            offset =  picwidth - blkwidth - 2;
 404            for (j = 0; j < blkheight; j++)
 405            {
 406                result = *((uint16*)ref);
 407                ref += 2;
 408                for (i = 2; i < blkwidth; i += 4)
 409                {
 410                    pix4 = *((uint32*)ref);
 411                    ref += 4;
 412                    prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
 413                    result |= prev_pix;
 414                    *((uint32*)out) = result;  /* write 4 bytes */
 415                    out += 4;
 416                    result = pix4 >> 16; /* for the next loop */
 417                }
 418                ref += offset;
 419                out += out_offset;
 420            }
 421            break;
 422        case 3:
 423            ref += y_pos * picwidth;
 424            offset =  picwidth - blkwidth - 1;
 425            for (j = 0; j < blkheight; j++)
 426            {
 427                result = *ref++;
 428                for (i = 1; i < blkwidth; i += 4)
 429                {
 430                    pix4 = *((uint32*)ref);
 431                    ref += 4;
 432                    prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
 433                    result |= prev_pix;
 434                    *((uint32*)out) = result;  /* write 4 bytes */
 435                    out += 4;
 436                    result = pix4 >> 24; /* for the next loop */
 437                }
 438                ref += offset;
 439                out += out_offset;
 440            }
 441            break;
 442    }
 443}
 444
 445void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
 446               uint8 *out, int blkwidth, int blkheight)
 447{
 448    int x_inc0, x_mid;
 449    int y_inc, y_inc0, y_inc1, y_mid;
 450    int i, j;
 451    int offset;
 452
 453    if (x_pos < 0)
 454    {
 455        x_inc0 = 0;  /* increment for the first part */
 456        x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth);  /* stopping point */
 457        x_pos = 0;
 458    }
 459    else if (x_pos + blkwidth > picwidth)
 460    {
 461        x_inc0 = 1;  /* increasing */
 462        x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0);  /* clip negative to zero, encode fool proof! */
 463    }
 464    else    /* normal case */
 465    {
 466        x_inc0 = 1;
 467        x_mid = blkwidth; /* just one run */
 468    }
 469
 470
 471    /* boundary for y_pos, taking the result from x_pos into account */
 472    if (y_pos < 0)
 473    {
 474        y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
 475        y_inc1 = picwidth + y_inc0;
 476        y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
 477        y_pos = 0;
 478    }
 479    else  if (y_pos + blkheight > picheight)
 480    {
 481        y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
 482        y_inc0 = picwidth + y_inc1;                 /* increasing */
 483        y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
 484    }
 485    else  /* normal case */
 486    {
 487        y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
 488        y_inc0 = picwidth + y_inc1;
 489        y_mid = blkheight;
 490    }
 491
 492    /* clip y_pos and x_pos */
 493    if (y_pos > picheight - 1) y_pos = picheight - 1;
 494    if (x_pos > picwidth - 1) x_pos = picwidth - 1;
 495
 496    ref += y_pos * picwidth + x_pos;
 497
 498    y_inc = y_inc0;  /* start with top half */
 499
 500    offset = 24 - blkwidth; /* to use in offset out */
 501    blkwidth -= x_mid; /* to use in the loop limit */
 502
 503    if (x_inc0 == 0)
 504    {
 505        for (j = 0; j < blkheight; j++)
 506        {
 507            if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
 508            {
 509                y_inc = y_inc1;  /* switch to lower half */
 510            }
 511            for (i = x_mid; i > 0; i--)   /* first or third quarter */
 512            {
 513                *out++ = *ref;
 514            }
 515            for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
 516            {
 517                *out++ = *ref++;
 518            }
 519            out += offset;
 520            ref += y_inc;
 521        }
 522    }
 523    else
 524    {
 525        for (j = 0; j < blkheight; j++)
 526        {
 527            if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
 528            {
 529                y_inc = y_inc1;  /* switch to lower half */
 530            }
 531            for (i = x_mid; i > 0; i--)   /* first or third quarter */
 532            {
 533                *out++ = *ref++;
 534            }
 535            for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
 536            {
 537                *out++ = *ref;
 538            }
 539            out += offset;
 540            ref += y_inc;
 541        }
 542    }
 543
 544    return ;
 545}
 546
 547void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
 548                   int blkwidth, int blkheight, int dx)
 549{
 550    uint8 *p_ref;
 551    uint32 *p_cur;
 552    uint32 tmp;
 553    uint32 pkres;
 554    int result, curr_offset, ref_offset;
 555    int j;
 556    int32 r0, r1, r2, r3, r4, r5;
 557    int32 r13, r6;
 558
 559    p_cur = (uint32*)out; /* assume it's word aligned */
 560    curr_offset = (outpitch - blkwidth) >> 2;
 561    p_ref = in;
 562    ref_offset = inpitch - blkwidth;
 563
 564    if (dx&1)
 565    {
 566        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
 567        p_ref -= 2;
 568        r13 = 0;
 569        for (j = blkheight; j > 0; j--)
 570        {
 571            tmp = (uint32)(p_ref + blkwidth);
 572            r0 = p_ref[0];
 573            r1 = p_ref[2];
 574            r0 |= (r1 << 16);           /* 0,c,0,a */
 575            r1 = p_ref[1];
 576            r2 = p_ref[3];
 577            r1 |= (r2 << 16);           /* 0,d,0,b */
 578            while ((uint32)p_ref < tmp)
 579            {
 580                r2 = *(p_ref += 4); /* move pointer to e */
 581                r3 = p_ref[2];
 582                r2 |= (r3 << 16);           /* 0,g,0,e */
 583                r3 = p_ref[1];
 584                r4 = p_ref[3];
 585                r3 |= (r4 << 16);           /* 0,h,0,f */
 586
 587                r4 = r0 + r3;       /* c+h, a+f */
 588                r5 = r0 + r1;   /* c+d, a+b */
 589                r6 = r2 + r3;   /* g+h, e+f */
 590                r5 >>= 16;
 591                r5 |= (r6 << 16);   /* e+f, c+d */
 592                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
 593                r4 += 0x100010; /* +16, +16 */
 594                r5 = r1 + r2;       /* d+g, b+e */
 595                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
 596                r4 >>= 5;
 597                r13 |= r4;      /* check clipping */
 598
 599                r5 = p_ref[dx+2];
 600                r6 = p_ref[dx+4];
 601                r5 |= (r6 << 16);
 602                r4 += r5;
 603                r4 += 0x10001;
 604                r4 = (r4 >> 1) & 0xFF00FF;
 605
 606                r5 = p_ref[4];  /* i */
 607                r6 = (r5 << 16);
 608                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
 609                r5 += r1;       /* d+i, b+g */ /* r5 not free */
 610                r1 >>= 16;
 611                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
 612                r1 += r2;       /* f+g, d+e */
 613                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
 614                r0 >>= 16;
 615                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
 616                r0 += r3;       /* e+h, c+f */
 617                r5 += 0x100010; /* 16,16 */
 618                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
 619                r5 >>= 5;
 620                r13 |= r5;      /* check clipping */
 621
 622                r0 = p_ref[dx+3];
 623                r1 = p_ref[dx+5];
 624                r0 |= (r1 << 16);
 625                r5 += r0;
 626                r5 += 0x10001;
 627                r5 = (r5 >> 1) & 0xFF00FF;
 628
 629                r4 |= (r5 << 8);    /* pack them together */
 630                *p_cur++ = r4;
 631                r1 = r3;
 632                r0 = r2;
 633            }
 634            p_cur += curr_offset; /* move to the next line */
 635            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
 636
 637            if (r13&0xFF000700) /* need clipping */
 638            {
 639                /* move back to the beginning of the line */
 640                p_ref -= (ref_offset + blkwidth);   /* input */
 641                p_cur -= (outpitch >> 2);
 642
 643                tmp = (uint32)(p_ref + blkwidth);
 644                for (; (uint32)p_ref < tmp;)
 645                {
 646
 647                    r0 = *p_ref++;
 648                    r1 = *p_ref++;
 649                    r2 = *p_ref++;
 650                    r3 = *p_ref++;
 651                    r4 = *p_ref++;
 652                    /* first pixel */
 653                    r5 = *p_ref++;
 654                    result = (r0 + r5);
 655                    r0 = (r1 + r4);
 656                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 657                    r0 = (r2 + r3);
 658                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 659                    result = (result + 16) >> 5;
 660                    CLIP_RESULT(result)
 661                    /* 3/4 pel,  no need to clip */
 662                    result = (result + p_ref[dx] + 1);
 663                    pkres = (result >> 1) ;
 664                    /* second pixel */
 665                    r0 = *p_ref++;
 666                    result = (r1 + r0);
 667                    r1 = (r2 + r5);
 668                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 669                    r1 = (r3 + r4);
 670                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 671                    result = (result + 16) >> 5;
 672                    CLIP_RESULT(result)
 673                    /* 3/4 pel,  no need to clip */
 674                    result = (result + p_ref[dx] + 1);
 675                    result = (result >> 1);
 676                    pkres  |= (result << 8);
 677                    /* third pixel */
 678                    r1 = *p_ref++;
 679                    result = (r2 + r1);
 680                    r2 = (r3 + r0);
 681                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 682                    r2 = (r4 + r5);
 683                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 684                    result = (result + 16) >> 5;
 685                    CLIP_RESULT(result)
 686                    /* 3/4 pel,  no need to clip */
 687                    result = (result + p_ref[dx] + 1);
 688                    result = (result >> 1);
 689                    pkres  |= (result << 16);
 690                    /* fourth pixel */
 691                    r2 = *p_ref++;
 692                    result = (r3 + r2);
 693                    r3 = (r4 + r1);
 694                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 695                    r3 = (r5 + r0);
 696                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 697                    result = (result + 16) >> 5;
 698                    CLIP_RESULT(result)
 699                    /* 3/4 pel,  no need to clip */
 700                    result = (result + p_ref[dx] + 1);
 701                    result = (result >> 1);
 702                    pkres  |= (result << 24);
 703                    *p_cur++ = pkres; /* write 4 pixels */
 704                    p_ref -= 5;  /* offset back to the middle of filter */
 705                }
 706                p_cur += curr_offset;  /* move to the next line */
 707                p_ref += ref_offset;    /* move to the next line */
 708            }
 709        }
 710    }
 711    else
 712    {
 713        p_ref -= 2;
 714        r13 = 0;
 715        for (j = blkheight; j > 0; j--)
 716        {
 717            tmp = (uint32)(p_ref + blkwidth);
 718            r0 = p_ref[0];
 719            r1 = p_ref[2];
 720            r0 |= (r1 << 16);           /* 0,c,0,a */
 721            r1 = p_ref[1];
 722            r2 = p_ref[3];
 723            r1 |= (r2 << 16);           /* 0,d,0,b */
 724            while ((uint32)p_ref < tmp)
 725            {
 726                r2 = *(p_ref += 4); /* move pointer to e */
 727                r3 = p_ref[2];
 728                r2 |= (r3 << 16);           /* 0,g,0,e */
 729                r3 = p_ref[1];
 730                r4 = p_ref[3];
 731                r3 |= (r4 << 16);           /* 0,h,0,f */
 732
 733                r4 = r0 + r3;       /* c+h, a+f */
 734                r5 = r0 + r1;   /* c+d, a+b */
 735                r6 = r2 + r3;   /* g+h, e+f */
 736                r5 >>= 16;
 737                r5 |= (r6 << 16);   /* e+f, c+d */
 738                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
 739                r4 += 0x100010; /* +16, +16 */
 740                r5 = r1 + r2;       /* d+g, b+e */
 741                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
 742                r4 >>= 5;
 743                r13 |= r4;      /* check clipping */
 744                r4 &= 0xFF00FF; /* mask */
 745
 746                r5 = p_ref[4];  /* i */
 747                r6 = (r5 << 16);
 748                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
 749                r5 += r1;       /* d+i, b+g */ /* r5 not free */
 750                r1 >>= 16;
 751                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
 752                r1 += r2;       /* f+g, d+e */
 753                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
 754                r0 >>= 16;
 755                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
 756                r0 += r3;       /* e+h, c+f */
 757                r5 += 0x100010; /* 16,16 */
 758                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
 759                r5 >>= 5;
 760                r13 |= r5;      /* check clipping */
 761                r5 &= 0xFF00FF; /* mask */
 762
 763                r4 |= (r5 << 8);    /* pack them together */
 764                *p_cur++ = r4;
 765                r1 = r3;
 766                r0 = r2;
 767            }
 768            p_cur += curr_offset; /* move to the next line */
 769            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
 770
 771            if (r13&0xFF000700) /* need clipping */
 772            {
 773                /* move back to the beginning of the line */
 774                p_ref -= (ref_offset + blkwidth);   /* input */
 775                p_cur -= (outpitch >> 2);
 776
 777                tmp = (uint32)(p_ref + blkwidth);
 778                for (; (uint32)p_ref < tmp;)
 779                {
 780
 781                    r0 = *p_ref++;
 782                    r1 = *p_ref++;
 783                    r2 = *p_ref++;
 784                    r3 = *p_ref++;
 785                    r4 = *p_ref++;
 786                    /* first pixel */
 787                    r5 = *p_ref++;
 788                    result = (r0 + r5);
 789                    r0 = (r1 + r4);
 790                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 791                    r0 = (r2 + r3);
 792                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 793                    result = (result + 16) >> 5;
 794                    CLIP_RESULT(result)
 795                    pkres  = result;
 796                    /* second pixel */
 797                    r0 = *p_ref++;
 798                    result = (r1 + r0);
 799                    r1 = (r2 + r5);
 800                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 801                    r1 = (r3 + r4);
 802                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 803                    result = (result + 16) >> 5;
 804                    CLIP_RESULT(result)
 805                    pkres  |= (result << 8);
 806                    /* third pixel */
 807                    r1 = *p_ref++;
 808                    result = (r2 + r1);
 809                    r2 = (r3 + r0);
 810                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 811                    r2 = (r4 + r5);
 812                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 813                    result = (result + 16) >> 5;
 814                    CLIP_RESULT(result)
 815                    pkres  |= (result << 16);
 816                    /* fourth pixel */
 817                    r2 = *p_ref++;
 818                    result = (r3 + r2);
 819                    r3 = (r4 + r1);
 820                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 821                    r3 = (r5 + r0);
 822                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 823                    result = (result + 16) >> 5;
 824                    CLIP_RESULT(result)
 825                    pkres  |= (result << 24);
 826                    *p_cur++ = pkres;   /* write 4 pixels */
 827                    p_ref -= 5;
 828                }
 829                p_cur += curr_offset; /* move to the next line */
 830                p_ref += ref_offset;
 831            }
 832        }
 833    }
 834
 835    return ;
 836}
 837
 838void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
 839                   int blkwidth, int blkheight, int dx)
 840{
 841    int *p_ref;
 842    uint32 *p_cur;
 843    uint32 tmp, pkres;
 844    int result, result2, curr_offset, ref_offset;
 845    int j, r0, r1, r2, r3, r4, r5;
 846
 847    p_cur = (uint32*)out; /* assume it's word aligned */
 848    curr_offset = (outpitch - blkwidth) >> 2;
 849    p_ref = in;
 850    ref_offset = inpitch - blkwidth;
 851
 852    if (dx&1)
 853    {
 854        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
 855
 856        for (j = blkheight; j > 0 ; j--)
 857        {
 858            tmp = (uint32)(p_ref + blkwidth);
 859            for (; (uint32)p_ref < tmp;)
 860            {
 861
 862                r0 = p_ref[-2];
 863                r1 = p_ref[-1];
 864                r2 = *p_ref++;
 865                r3 = *p_ref++;
 866                r4 = *p_ref++;
 867                /* first pixel */
 868                r5 = *p_ref++;
 869                result = (r0 + r5);
 870                r0 = (r1 + r4);
 871                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 872                r0 = (r2 + r3);
 873                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 874                result = (result + 512) >> 10;
 875                CLIP_RESULT(result)
 876                result2 = ((p_ref[dx] + 16) >> 5);
 877                CLIP_RESULT(result2)
 878                /* 3/4 pel,  no need to clip */
 879                result = (result + result2 + 1);
 880                pkres = (result >> 1);
 881                /* second pixel */
 882                r0 = *p_ref++;
 883                result = (r1 + r0);
 884                r1 = (r2 + r5);
 885                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 886                r1 = (r3 + r4);
 887                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 888                result = (result + 512) >> 10;
 889                CLIP_RESULT(result)
 890                result2 = ((p_ref[dx] + 16) >> 5);
 891                CLIP_RESULT(result2)
 892                /* 3/4 pel,  no need to clip */
 893                result = (result + result2 + 1);
 894                result = (result >> 1);
 895                pkres  |= (result << 8);
 896                /* third pixel */
 897                r1 = *p_ref++;
 898                result = (r2 + r1);
 899                r2 = (r3 + r0);
 900                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 901                r2 = (r4 + r5);
 902                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 903                result = (result + 512) >> 10;
 904                CLIP_RESULT(result)
 905                result2 = ((p_ref[dx] + 16) >> 5);
 906                CLIP_RESULT(result2)
 907                /* 3/4 pel,  no need to clip */
 908                result = (result + result2 + 1);
 909                result = (result >> 1);
 910                pkres  |= (result << 16);
 911                /* fourth pixel */
 912                r2 = *p_ref++;
 913                result = (r3 + r2);
 914                r3 = (r4 + r1);
 915                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 916                r3 = (r5 + r0);
 917                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 918                result = (result + 512) >> 10;
 919                CLIP_RESULT(result)
 920                result2 = ((p_ref[dx] + 16) >> 5);
 921                CLIP_RESULT(result2)
 922                /* 3/4 pel,  no need to clip */
 923                result = (result + result2 + 1);
 924                result = (result >> 1);
 925                pkres  |= (result << 24);
 926                *p_cur++ = pkres; /* write 4 pixels */
 927                p_ref -= 3;  /* offset back to the middle of filter */
 928            }
 929            p_cur += curr_offset;  /* move to the next line */
 930            p_ref += ref_offset;    /* move to the next line */
 931        }
 932    }
 933    else
 934    {
 935        for (j = blkheight; j > 0 ; j--)
 936        {
 937            tmp = (uint32)(p_ref + blkwidth);
 938            for (; (uint32)p_ref < tmp;)
 939            {
 940
 941                r0 = p_ref[-2];
 942                r1 = p_ref[-1];
 943                r2 = *p_ref++;
 944                r3 = *p_ref++;
 945                r4 = *p_ref++;
 946                /* first pixel */
 947                r5 = *p_ref++;
 948                result = (r0 + r5);
 949                r0 = (r1 + r4);
 950                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
 951                r0 = (r2 + r3);
 952                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
 953                result = (result + 512) >> 10;
 954                CLIP_RESULT(result)
 955                pkres  = result;
 956                /* second pixel */
 957                r0 = *p_ref++;
 958                result = (r1 + r0);
 959                r1 = (r2 + r5);
 960                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
 961                r1 = (r3 + r4);
 962                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
 963                result = (result + 512) >> 10;
 964                CLIP_RESULT(result)
 965                pkres  |= (result << 8);
 966                /* third pixel */
 967                r1 = *p_ref++;
 968                result = (r2 + r1);
 969                r2 = (r3 + r0);
 970                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
 971                r2 = (r4 + r5);
 972                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
 973                result = (result + 512) >> 10;
 974                CLIP_RESULT(result)
 975                pkres  |= (result << 16);
 976                /* fourth pixel */
 977                r2 = *p_ref++;
 978                result = (r3 + r2);
 979                r3 = (r4 + r1);
 980                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
 981                r3 = (r5 + r0);
 982                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
 983                result = (result + 512) >> 10;
 984                CLIP_RESULT(result)
 985                pkres  |= (result << 24);
 986                *p_cur++ = pkres; /* write 4 pixels */
 987                p_ref -= 3;  /* offset back to the middle of filter */
 988            }
 989            p_cur += curr_offset;  /* move to the next line */
 990            p_ref += ref_offset;    /* move to the next line */
 991        }
 992    }
 993
 994    return ;
 995}
 996
 997void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
 998                   int blkwidth, int blkheight)
 999{
1000    uint8 *p_ref;
1001    int   *p_cur;
1002    uint32 tmp;
1003    int result, curr_offset, ref_offset;
1004    int j, r0, r1, r2, r3, r4, r5;
1005
1006    p_cur = out;
1007    curr_offset = (outpitch - blkwidth);
1008    p_ref = in;
1009    ref_offset = inpitch - blkwidth;
1010
1011    for (j = blkheight; j > 0 ; j--)
1012    {
1013        tmp = (uint32)(p_ref + blkwidth);
1014        for (; (uint32)p_ref < tmp;)
1015        {
1016
1017            r0 = p_ref[-2];
1018            r1 = p_ref[-1];
1019            r2 = *p_ref++;
1020            r3 = *p_ref++;
1021            r4 = *p_ref++;
1022            /* first pixel */
1023            r5 = *p_ref++;
1024            result = (r0 + r5);
1025            r0 = (r1 + r4);
1026            result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1027            r0 = (r2 + r3);
1028            result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1029            *p_cur++ = result;
1030            /* second pixel */
1031            r0 = *p_ref++;
1032            result = (r1 + r0);
1033            r1 = (r2 + r5);
1034            result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1035            r1 = (r3 + r4);
1036            result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1037            *p_cur++ = result;
1038            /* third pixel */
1039            r1 = *p_ref++;
1040            result = (r2 + r1);
1041            r2 = (r3 + r0);
1042            result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1043            r2 = (r4 + r5);
1044            result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1045            *p_cur++ = result;
1046            /* fourth pixel */
1047            r2 = *p_ref++;
1048            result = (r3 + r2);
1049            r3 = (r4 + r1);
1050            result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1051            r3 = (r5 + r0);
1052            result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1053            *p_cur++ = result;
1054            p_ref -= 3; /* move back to the middle of the filter */
1055        }
1056        p_cur += curr_offset; /* move to the next line */
1057        p_ref += ref_offset;
1058    }
1059
1060    return ;
1061}
1062void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1063                   int blkwidth, int blkheight, int dy)
1064{
1065    uint8 *p_cur, *p_ref;
1066    uint32 tmp;
1067    int result, curr_offset, ref_offset;
1068    int j, i;
1069    int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
1070    uint8  tmp_in[24][24];
1071
1072    /* not word-aligned */
1073    if (((uint32)in)&0x3)
1074    {
1075        CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1076        in = &tmp_in[2][0];
1077        inpitch = 24;
1078    }
1079    p_cur = out;
1080    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1081    ref_offset = blkheight * inpitch; /* for limit */
1082
1083    curr_offset += 3;
1084
1085    if (dy&1)
1086    {
1087        dy = (dy >> 1) ? 0 : -inpitch;
1088
1089        for (j = 0; j < blkwidth; j += 4, in += 4)
1090        {
1091            r13 = 0;
1092            p_ref = in;
1093            p_cur -= outpitch;  /* compensate for the first offset */
1094            tmp = (uint32)(p_ref + ref_offset); /* limit */
1095            while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1096            {
1097                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1098                p_ref += inpitch;
1099                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1100                r0 &= 0xFF00FF;
1101
1102                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1103                r7 = (r1 >> 8) & 0xFF00FF;
1104                r1 &= 0xFF00FF;
1105
1106                r0 += r1;
1107                r6 += r7;
1108
1109                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1110                r8 = (r2 >> 8) & 0xFF00FF;
1111                r2 &= 0xFF00FF;
1112
1113                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1114                r7 = (r1 >> 8) & 0xFF00FF;
1115                r1 &= 0xFF00FF;
1116                r1 += r2;
1117
1118                r7 += r8;
1119
1120                r0 += 20 * r1;
1121                r6 += 20 * r7;
1122                r0 += 0x100010;
1123                r6 += 0x100010;
1124
1125                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1126                r8 = (r2 >> 8) & 0xFF00FF;
1127                r2 &= 0xFF00FF;
1128
1129                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1130                r7 = (r1 >> 8) & 0xFF00FF;
1131                r1 &= 0xFF00FF;
1132                r1 += r2;
1133
1134                r7 += r8;
1135
1136                r0 -= 5 * r1;
1137                r6 -= 5 * r7;
1138
1139                r0 >>= 5;
1140                r6 >>= 5;
1141                /* clip */
1142                r13 |= r6;
1143                r13 |= r0;
1144                //CLIPPACK(r6,result)
1145
1146                r1 = *((uint32*)(p_ref + dy));
1147                r2 = (r1 >> 8) & 0xFF00FF;
1148                r1 &= 0xFF00FF;
1149                r0 += r1;
1150                r6 += r2;
1151                r0 += 0x10001;
1152                r6 += 0x10001;
1153                r0 = (r0 >> 1) & 0xFF00FF;
1154                r6 = (r6 >> 1) & 0xFF00FF;
1155
1156                r0 |= (r6 << 8);  /* pack it back */
1157                *((uint32*)(p_cur += outpitch)) = r0;
1158            }
1159            p_cur += curr_offset; /* offset to the next pixel */
1160            if (r13 & 0xFF000700) /* this column need clipping */
1161            {
1162                p_cur -= 4;
1163                for (i = 0; i < 4; i++)
1164                {
1165                    p_ref = in + i;
1166                    p_cur -= outpitch;  /* compensate for the first offset */
1167
1168                    tmp = (uint32)(p_ref + ref_offset); /* limit */
1169                    while ((uint32)p_ref < tmp)
1170                    {                           /* loop un-rolled */
1171                        r0 = *(p_ref - (inpitch << 1));
1172                        r1 = *(p_ref - inpitch);
1173                        r2 = *p_ref;
1174                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1175                        r4 = *(p_ref += inpitch);
1176                        /* first pixel */
1177                        r5 = *(p_ref += inpitch);
1178                        result = (r0 + r5);
1179                        r0 = (r1 + r4);
1180                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1181                        r0 = (r2 + r3);
1182                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1183                        result = (result + 16) >> 5;
1184                        CLIP_RESULT(result)
1185                        /* 3/4 pel,  no need to clip */
1186                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1187                        result = (result >> 1);
1188                        *(p_cur += outpitch) = result;
1189                        /* second pixel */
1190                        r0 = *(p_ref += inpitch);
1191                        result = (r1 + r0);
1192                        r1 = (r2 + r5);
1193                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1194                        r1 = (r3 + r4);
1195                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1196                        result = (result + 16) >> 5;
1197                        CLIP_RESULT(result)
1198                        /* 3/4 pel,  no need to clip */
1199                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1200                        result = (result >> 1);
1201                        *(p_cur += outpitch) = result;
1202                        /* third pixel */
1203                        r1 = *(p_ref += inpitch);
1204                        result = (r2 + r1);
1205                        r2 = (r3 + r0);
1206                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1207                        r2 = (r4 + r5);
1208                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1209                        result = (result + 16) >> 5;
1210                        CLIP_RESULT(result)
1211                        /* 3/4 pel,  no need to clip */
1212                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1213                        result = (result >> 1);
1214                        *(p_cur += outpitch) = result;
1215                        /* fourth pixel */
1216                        r2 = *(p_ref += inpitch);
1217                        result = (r3 + r2);
1218                        r3 = (r4 + r1);
1219                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1220                        r3 = (r5 + r0);
1221                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1222                        result = (result + 16) >> 5;
1223                        CLIP_RESULT(result)
1224                        /* 3/4 pel,  no need to clip */
1225                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
1226                        result = (result >> 1);
1227                        *(p_cur += outpitch) = result;
1228                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1229                    }
1230                    p_cur += (curr_offset - 3);
1231                }
1232            }
1233        }
1234    }
1235    else
1236    {
1237        for (j = 0; j < blkwidth; j += 4, in += 4)
1238        {
1239            r13 = 0;
1240            p_ref = in;
1241            p_cur -= outpitch;  /* compensate for the first offset */
1242            tmp = (uint32)(p_ref + ref_offset); /* limit */
1243            while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1244            {
1245                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1246                p_ref += inpitch;
1247                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1248                r0 &= 0xFF00FF;
1249
1250                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1251                r7 = (r1 >> 8) & 0xFF00FF;
1252                r1 &= 0xFF00FF;
1253
1254                r0 += r1;
1255                r6 += r7;
1256
1257                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1258                r8 = (r2 >> 8) & 0xFF00FF;
1259                r2 &= 0xFF00FF;
1260
1261                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1262                r7 = (r1 >> 8) & 0xFF00FF;
1263                r1 &= 0xFF00FF;
1264                r1 += r2;
1265
1266                r7 += r8;
1267
1268                r0 += 20 * r1;
1269                r6 += 20 * r7;
1270                r0 += 0x100010;
1271                r6 += 0x100010;
1272
1273                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1274                r8 = (r2 >> 8) & 0xFF00FF;
1275                r2 &= 0xFF00FF;
1276
1277                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1278                r7 = (r1 >> 8) & 0xFF00FF;
1279                r1 &= 0xFF00FF;
1280                r1 += r2;
1281
1282                r7 += r8;
1283
1284                r0 -= 5 * r1;
1285                r6 -= 5 * r7;
1286
1287                r0 >>= 5;
1288                r6 >>= 5;
1289                /* clip */
1290                r13 |= r6;
1291                r13 |= r0;
1292                //CLIPPACK(r6,result)
1293                r0 &= 0xFF00FF;
1294                r6 &= 0xFF00FF;
1295                r0 |= (r6 << 8);  /* pack it back */
1296                *((uint32*)(p_cur += outpitch)) = r0;
1297            }
1298            p_cur += curr_offset; /* offset to the next pixel */
1299            if (r13 & 0xFF000700) /* this column need clipping */
1300            {
1301                p_cur -= 4;
1302                for (i = 0; i < 4; i++)
1303                {
1304                    p_ref = in + i;
1305                    p_cur -= outpitch;  /* compensate for the first offset */
1306                    tmp = (uint32)(p_ref + ref_offset); /* limit */
1307                    while ((uint32)p_ref < tmp)
1308                    {                           /* loop un-rolled */
1309                        r0 = *(p_ref - (inpitch << 1));
1310                        r1 = *(p_ref - inpitch);
1311                        r2 = *p_ref;
1312                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1313                        r4 = *(p_ref += inpitch);
1314                        /* first pixel */
1315                        r5 = *(p_ref += inpitch);
1316                        result = (r0 + r5);
1317                        r0 = (r1 + r4);
1318                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1319                        r0 = (r2 + r3);
1320                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1321                        result = (result + 16) >> 5;
1322                        CLIP_RESULT(result)
1323                        *(p_cur += outpitch) = result;
1324                        /* second pixel */
1325                        r0 = *(p_ref += inpitch);
1326                        result = (r1 + r0);
1327                        r1 = (r2 + r5);
1328                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1329                        r1 = (r3 + r4);
1330                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1331                        result = (result + 16) >> 5;
1332                        CLIP_RESULT(result)
1333                        *(p_cur += outpitch) = result;
1334                        /* third pixel */
1335                        r1 = *(p_ref += inpitch);
1336                        result = (r2 + r1);
1337                        r2 = (r3 + r0);
1338                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1339                        r2 = (r4 + r5);
1340                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1341                        result = (result + 16) >> 5;
1342                        CLIP_RESULT(result)
1343                        *(p_cur += outpitch) = result;
1344                        /* fourth pixel */
1345                        r2 = *(p_ref += inpitch);
1346                        result = (r3 + r2);
1347                        r3 = (r4 + r1);
1348                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1349                        r3 = (r5 + r0);
1350                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1351                        result = (result + 16) >> 5;
1352                        CLIP_RESULT(result)
1353                        *(p_cur += outpitch) = result;
1354                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1355                    }
1356                    p_cur += (curr_offset - 3);
1357                }
1358            }
1359        }
1360    }
1361
1362    return ;
1363}
1364
1365void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1366                   int blkwidth, int blkheight)
1367{
1368    int *p_cur;
1369    uint8 *p_ref;
1370    uint32 tmp;
1371    int result, curr_offset, ref_offset;
1372    int j, r0, r1, r2, r3, r4, r5;
1373
1374    p_cur = out;
1375    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1376    ref_offset = blkheight * inpitch; /* for limit */
1377
1378    for (j = 0; j < blkwidth; j++)
1379    {
1380        p_cur -= outpitch; /* compensate for the first offset */
1381        p_ref = in++;
1382
1383        tmp = (uint32)(p_ref + ref_offset); /* limit */
1384        while ((uint32)p_ref < tmp)
1385        {                           /* loop un-rolled */
1386            r0 = *(p_ref - (inpitch << 1));
1387            r1 = *(p_ref - inpitch);
1388            r2 = *p_ref;
1389            r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1390            r4 = *(p_ref += inpitch);
1391            /* first pixel */
1392            r5 = *(p_ref += inpitch);
1393            result = (r0 + r5);
1394            r0 = (r1 + r4);
1395            result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1396            r0 = (r2 + r3);
1397            result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1398            *(p_cur += outpitch) = result;
1399            /* second pixel */
1400            r0 = *(p_ref += inpitch);
1401            result = (r1 + r0);
1402            r1 = (r2 + r5);
1403            result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1404            r1 = (r3 + r4);
1405            result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1406            *(p_cur += outpitch) = result;
1407            /* third pixel */
1408            r1 = *(p_ref += inpitch);
1409            result = (r2 + r1);
1410            r2 = (r3 + r0);
1411            result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1412            r2 = (r4 + r5);
1413            result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1414            *(p_cur += outpitch) = result;
1415            /* fourth pixel */
1416            r2 = *(p_ref += inpitch);
1417            result = (r3 + r2);
1418            r3 = (r4 + r1);
1419            result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1420            r3 = (r5 + r0);
1421            result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1422            *(p_cur += outpitch) = result;
1423            p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1424        }
1425        p_cur += curr_offset;
1426    }
1427
1428    return ;
1429}
1430
1431void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
1432                   int blkwidth, int blkheight, int dy)
1433{
1434    uint8 *p_cur;
1435    int *p_ref;
1436    uint32 tmp;
1437    int result, result2, curr_offset, ref_offset;
1438    int j, r0, r1, r2, r3, r4, r5;
1439
1440    p_cur = out;
1441    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1442    ref_offset = blkheight * inpitch; /* for limit */
1443
1444    if (dy&1)
1445    {
1446        dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
1447
1448        for (j = 0; j < blkwidth; j++)
1449        {
1450            p_cur -= outpitch; /* compensate for the first offset */
1451            p_ref = in++;
1452
1453            tmp = (uint32)(p_ref + ref_offset); /* limit */
1454            while ((uint32)p_ref < tmp)
1455            {                           /* loop un-rolled */
1456                r0 = *(p_ref - (inpitch << 1));
1457                r1 = *(p_ref - inpitch);
1458                r2 = *p_ref;
1459                r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1460                r4 = *(p_ref += inpitch);
1461                /* first pixel */
1462                r5 = *(p_ref += inpitch);
1463                result = (r0 + r5);
1464                r0 = (r1 + r4);
1465                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1466                r0 = (r2 + r3);
1467                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1468                result = (result + 512) >> 10;
1469                CLIP_RESULT(result)
1470                result2 = ((p_ref[dy] + 16) >> 5);
1471                CLIP_RESULT(result2)
1472                /* 3/4 pel,  no need to clip */
1473                result = (result + result2 + 1);
1474                result = (result >> 1);
1475                *(p_cur += outpitch) = result;
1476                /* second pixel */
1477                r0 = *(p_ref += inpitch);
1478                result = (r1 + r0);
1479                r1 = (r2 + r5);
1480                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1481                r1 = (r3 + r4);
1482                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1483                result = (result + 512) >> 10;
1484                CLIP_RESULT(result)
1485                result2 = ((p_ref[dy] + 16) >> 5);
1486                CLIP_RESULT(result2)
1487                /* 3/4 pel,  no need to clip */
1488                result = (result + result2 + 1);
1489                result = (result >> 1);
1490                *(p_cur += outpitch) = result;
1491                /* third pixel */
1492                r1 = *(p_ref += inpitch);
1493                result = (r2 + r1);
1494                r2 = (r3 + r0);
1495                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1496                r2 = (r4 + r5);
1497                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1498                result = (result + 512) >> 10;
1499                CLIP_RESULT(result)
1500                result2 = ((p_ref[dy] + 16) >> 5);
1501                CLIP_RESULT(result2)
1502                /* 3/4 pel,  no need to clip */
1503                result = (result + result2 + 1);
1504                result = (result >> 1);
1505                *(p_cur += outpitch) = result;
1506                /* fourth pixel */
1507                r2 = *(p_ref += inpitch);
1508                result = (r3 + r2);
1509                r3 = (r4 + r1);
1510                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1511                r3 = (r5 + r0);
1512                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1513                result = (result + 512) >> 10;
1514                CLIP_RESULT(result)
1515                result2 = ((p_ref[dy] + 16) >> 5);
1516                CLIP_RESULT(result2)
1517                /* 3/4 pel,  no need to clip */
1518                result = (result + result2 + 1);
1519                result = (result >> 1);
1520                *(p_cur += outpitch) = result;
1521                p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1522            }
1523            p_cur += curr_offset;
1524        }
1525    }
1526    else
1527    {
1528        for (j = 0; j < blkwidth; j++)
1529        {
1530            p_cur -= outpitch; /* compensate for the first offset */
1531            p_ref = in++;
1532
1533            tmp = (uint32)(p_ref + ref_offset); /* limit */
1534            while ((uint32)p_ref < tmp)
1535            {                           /* loop un-rolled */
1536                r0 = *(p_ref - (inpitch << 1));
1537                r1 = *(p_ref - inpitch);
1538                r2 = *p_ref;
1539                r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1540                r4 = *(p_ref += inpitch);
1541                /* first pixel */
1542                r5 = *(p_ref += inpitch);
1543                result = (r0 + r5);
1544                r0 = (r1 + r4);
1545                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1546                r0 = (r2 + r3);
1547                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1548                result = (result + 512) >> 10;
1549                CLIP_RESULT(result)
1550                *(p_cur += outpitch) = result;
1551                /* second pixel */
1552                r0 = *(p_ref += inpitch);
1553                result = (r1 + r0);
1554                r1 = (r2 + r5);
1555                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1556                r1 = (r3 + r4);
1557                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1558                result = (result + 512) >> 10;
1559                CLIP_RESULT(result)
1560                *(p_cur += outpitch) = result;
1561                /* third pixel */
1562                r1 = *(p_ref += inpitch);
1563                result = (r2 + r1);
1564                r2 = (r3 + r0);
1565                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1566                r2 = (r4 + r5);
1567                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1568                result = (result + 512) >> 10;
1569                CLIP_RESULT(result)
1570                *(p_cur += outpitch) = result;
1571                /* fourth pixel */
1572                r2 = *(p_ref += inpitch);
1573                result = (r3 + r2);
1574                r3 = (r4 + r1);
1575                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1576                r3 = (r5 + r0);
1577                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1578                result = (result + 512) >> 10;
1579                CLIP_RESULT(result)
1580                *(p_cur += outpitch) = result;
1581                p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1582            }
1583            p_cur += curr_offset;
1584        }
1585    }
1586
1587    return ;
1588}
1589
1590void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
1591                      uint8 *out, int outpitch,
1592                      int blkwidth, int blkheight)
1593{
1594    int j, i;
1595    int result;
1596    uint8 *p_cur, *p_ref, *p_tmp8;
1597    int curr_offset, ref_offset;
1598    uint8 tmp_res[24][24], tmp_in[24][24];
1599    uint32 *p_tmp;
1600    uint32 tmp, pkres, tmp_result;
1601    int32 r0, r1, r2, r3, r4, r5;
1602    int32 r6, r7, r8, r9, r10, r13;
1603
1604    ref_offset = inpitch - blkwidth;
1605    p_ref = in1 - 2;
1606    /* perform horizontal interpolation */
1607    /* not word-aligned */
1608    /* It is faster to read 1 byte at time to avoid calling CreateAlign */
1609    /*  if(((uint32)p_ref)&0x3)
1610        {
1611            CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
1612            p_ref = &tmp_in[0][0];
1613            ref_offset = 24-blkwidth;
1614        }*/
1615
1616    p_tmp = (uint32*) & (tmp_res[0][0]);
1617    for (j = blkheight; j > 0; j--)
1618    {
1619        r13 = 0;
1620        tmp = (uint32)(p_ref + blkwidth);
1621
1622        //r0 = *((uint32*)p_ref);   /* d,c,b,a */
1623        //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
1624        //r0 &= 0xFF00FF;           /* 0,c,0,a */
1625        /* It is faster to read 1 byte at a time,  */
1626        r0 = p_ref[0];
1627        r1 = p_ref[2];
1628        r0 |= (r1 << 16);           /* 0,c,0,a */
1629        r1 = p_ref[1];
1630        r2 = p_ref[3];
1631        r1 |= (r2 << 16);           /* 0,d,0,b */
1632
1633        while ((uint32)p_ref < tmp)
1634        {
1635            //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
1636            //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
1637            //r2 &= 0xFF00FF;           /* 0,g,0,e */
1638            /* It is faster to read 1 byte at a time,  */
1639            r2 = *(p_ref += 4);
1640            r3 = p_ref[2];
1641            r2 |= (r3 << 16);           /* 0,g,0,e */
1642            r3 = p_ref[1];
1643            r4 = p_ref[3];
1644            r3 |= (r4 << 16);           /* 0,h,0,f */
1645
1646            r4 = r0 + r3;       /* c+h, a+f */
1647            r5 = r0 + r1;   /* c+d, a+b */
1648            r6 = r2 + r3;   /* g+h, e+f */
1649            r5 >>= 16;
1650            r5 |= (r6 << 16);   /* e+f, c+d */
1651            r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
1652            r4 += 0x100010; /* +16, +16 */
1653            r5 = r1 + r2;       /* d+g, b+e */
1654            r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
1655            r4 >>= 5;
1656            r13 |= r4;      /* check clipping */
1657            r4 &= 0xFF00FF; /* mask */
1658
1659            r5 = p_ref[4];  /* i */
1660            r6 = (r5 << 16);
1661            r5 = r6 | (r2 >> 16);/* 0,i,0,g */
1662            r5 += r1;       /* d+i, b+g */ /* r5 not free */
1663            r1 >>= 16;
1664            r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
1665            r1 += r2;       /* f+g, d+e */
1666            r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
1667            r0 >>= 16;
1668            r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
1669            r0 += r3;       /* e+h, c+f */
1670            r5 += 0x100010; /* 16,16 */
1671            r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
1672            r5 >>= 5;
1673            r13 |= r5;      /* check clipping */
1674            r5 &= 0xFF00FF; /* mask */
1675
1676            r4 |= (r5 << 8);    /* pack them together */
1677            *p_tmp++ = r4;
1678            r1 = r3;
1679            r0 = r2;
1680        }
1681        p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1682        p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1683
1684        if (r13&0xFF000700) /* need clipping */
1685        {
1686            /* move back to the beginning of the line */
1687            p_ref -= (ref_offset + blkwidth);   /* input */
1688            p_tmp -= 6; /* intermediate output */
1689            tmp = (uint32)(p_ref + blkwidth);
1690            while ((uint32)p_ref < tmp)
1691            {
1692                r0 = *p_ref++;
1693                r1 = *p_ref++;
1694                r2 = *p_ref++;
1695                r3 = *p_ref++;
1696                r4 = *p_ref++;
1697                /* first pixel */
1698                r5 = *p_ref++;
1699                result = (r0 + r5);
1700                r0 = (r1 + r4);
1701                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1702                r0 = (r2 + r3);
1703                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1704                result = (result + 16) >> 5;
1705                CLIP_RESULT(result)
1706                pkres = result;
1707                /* second pixel */
1708                r0 = *p_ref++;
1709                result = (r1 + r0);
1710                r1 = (r2 + r5);
1711                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1712                r1 = (r3 + r4);
1713                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1714                result = (result + 16) >> 5;
1715                CLIP_RESULT(result)
1716                pkres |= (result << 8);
1717                /* third pixel */
1718                r1 = *p_ref++;
1719                result = (r2 + r1);
1720                r2 = (r3 + r0);
1721                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1722                r2 = (r4 + r5);
1723                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1724                result = (result + 16) >> 5;
1725                CLIP_RESULT(result)
1726                pkres |= (result << 16);
1727                /* fourth pixel */
1728                r2 = *p_ref++;
1729                result = (r3 + r2);
1730                r3 = (r4 + r1);
1731                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1732                r3 = (r5 + r0);
1733                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1734                result = (result + 16) >> 5;
1735                CLIP_RESULT(result)
1736                pkres |= (result << 24);
1737
1738                *p_tmp++ = pkres; /* write 4 pixel */
1739                p_ref -= 5;
1740            }
1741            p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1742            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1743        }
1744    }
1745
1746    /*  perform vertical interpolation */
1747    /* not word-aligned */
1748    if (((uint32)in2)&0x3)
1749    {
1750        CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1751        in2 = &tmp_in[2][0];
1752        inpitch = 24;
1753    }
1754
1755    p_cur = out;
1756    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
1757    pkres = blkheight * inpitch; /* reuse it for limit */
1758
1759    curr_offset += 3;
1760
1761    for (j = 0; j < blkwidth; j += 4, in2 += 4)
1762    {
1763        r13 = 0;
1764        p_ref = in2;
1765        p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
1766        p_tmp8 -= 24;  /* compensate for the first offset */
1767        p_cur -= outpitch;  /* compensate for the first offset */
1768        tmp = (uint32)(p_ref + pkres); /* limit */
1769        while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1770        {
1771            /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign,  */
1772            /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
1773            r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
1774            r6 |= (r1<<16);         p_ref+=inpitch; */
1775            r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1776            p_ref += inpitch;
1777            r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1778            r0 &= 0xFF00FF;
1779
1780            /*p_ref8 = p_ref+(inpitch<<1);
1781            r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
1782            r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
1783            r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1784            r7 = (r1 >> 8) & 0xFF00FF;
1785            r1 &= 0xFF00FF;
1786
1787            r0 += r1;
1788            r6 += r7;
1789
1790            /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
1791            r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
1792            r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1793            r8 = (r2 >> 8) & 0xFF00FF;
1794            r2 &= 0xFF00FF;
1795
1796            /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1797            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1798            r2 = p_ref8[3];         r7 |= (r2<<16);*/
1799            r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1800            r7 = (r1 >> 8) & 0xFF00FF;
1801            r1 &= 0xFF00FF;
1802            r1 += r2;
1803
1804            r7 += r8;
1805
1806            r0 += 20 * r1;
1807            r6 += 20 * r7;
1808            r0 += 0x100010;
1809            r6 += 0x100010;
1810
1811            /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
1812            r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
1813            r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1814            r8 = (r2 >> 8) & 0xFF00FF;
1815            r2 &= 0xFF00FF;
1816
1817            /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1818            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1819            r2 = p_ref8[3];         r7 |= (r2<<16);*/
1820            r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1821            r7 = (r1 >> 8) & 0xFF00FF;
1822            r1 &= 0xFF00FF;
1823            r1 += r2;
1824
1825            r7 += r8;
1826
1827            r0 -= 5 * r1;
1828            r6 -= 5 * r7;
1829
1830            r0 >>= 5;
1831            r6 >>= 5;
1832            /* clip */
1833            r13 |= r6;
1834            r13 |= r0;
1835            //CLIPPACK(r6,result)
1836            /* add with horizontal results */
1837            r10 = *((uint32*)(p_tmp8 += 24));
1838            r9 = (r10 >> 8) & 0xFF00FF;
1839            r10 &= 0xFF00FF;
1840
1841            r0 += r10;
1842            r0 += 0x10001;
1843            r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1844
1845            r6 += r9;
1846            r6 += 0x10001;
1847            r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1848
1849            r0 |= (r6 << 8);  /* pack it back */
1850            *((uint32*)(p_cur += outpitch)) = r0;
1851        }
1852        p_cur += curr_offset; /* offset to the next pixel */
1853        if (r13 & 0xFF000700) /* this column need clipping */
1854        {
1855            p_cur -= 4;
1856            for (i = 0; i < 4; i++)
1857            {
1858                p_ref = in2 + i;
1859                p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
1860                p_tmp8 -= 24;  /* compensate for the first offset */
1861                p_cur -= outpitch;  /* compensate for the first offset */
1862                tmp = (uint32)(p_ref + pkres); /* limit */
1863                while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
1864                {
1865                    r0 = *(p_ref - (inpitch << 1));
1866                    r1 = *(p_ref - inpitch);
1867                    r2 = *p_ref;
1868                    r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1869                    r4 = *(p_ref += inpitch);
1870                    /* first pixel */
1871                    r5 = *(p_ref += inpitch);
1872                    result = (r0 + r5);
1873                    r0 = (r1 + r4);
1874                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1875                    r0 = (r2 + r3);
1876                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1877                    result = (result + 16) >> 5;
1878                    CLIP_RESULT(result)
1879                    tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
1880                    result = (result + tmp_result + 1);  /* no clip */
1881                    result = (result >> 1);
1882                    *(p_cur += outpitch) = result;
1883                    /* second pixel */
1884                    r0 = *(p_ref += inpitch);
1885                    result = (r1 + r0);
1886                    r1 = (r2 + r5);
1887                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1888                    r1 = (r3 + r4);
1889                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1890                    result = (result + 16) >> 5;
1891                    CLIP_RESULT(result)
1892                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1893                    result = (result + tmp_result + 1);  /* no clip */
1894                    result = (result >> 1);
1895                    *(p_cur += outpitch) = result;
1896                    /* third pixel */
1897                    r1 = *(p_ref += inpitch);
1898                    result = (r2 + r1);
1899                    r2 = (r3 + r0);
1900                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1901                    r2 = (r4 + r5);
1902                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1903                    result = (result + 16) >> 5;
1904                    CLIP_RESULT(result)
1905                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1906                    result = (result + tmp_result + 1);  /* no clip */
1907                    result = (result >> 1);
1908                    *(p_cur += outpitch) = result;
1909                    /* fourth pixel */
1910                    r2 = *(p_ref += inpitch);
1911                    result = (r3 + r2);
1912                    r3 = (r4 + r1);
1913                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1914                    r3 = (r5 + r0);
1915                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1916                    result = (result + 16) >> 5;
1917                    CLIP_RESULT(result)
1918                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1919                    result = (result + tmp_result + 1);  /* no clip */
1920                    result = (result >> 1);
1921                    *(p_cur += outpitch) = result;
1922                    p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1923                }
1924                p_cur += (curr_offset - 3);
1925            }
1926        }
1927    }
1928
1929    return ;
1930}
1931
1932/* position G */
1933void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1934               int blkwidth, int blkheight)
1935{
1936    int i, j;
1937    int offset_in = inpitch - blkwidth;
1938    int offset_out = outpitch - blkwidth;
1939    uint32 temp;
1940    uint8 byte;
1941
1942    if (((uint32)in)&3)
1943    {
1944        for (j = blkheight; j > 0; j--)
1945        {
1946            for (i = blkwidth; i > 0; i -= 4)
1947            {
1948                temp = *in++;
1949                byte = *in++;
1950                temp |= (byte << 8);
1951                byte = *in++;
1952                temp |= (byte << 16);
1953                byte = *in++;
1954                temp |= (byte << 24);
1955
1956                *((uint32*)out) = temp; /* write 4 bytes */
1957                out += 4;
1958            }
1959            out += offset_out;
1960            in += offset_in;
1961        }
1962    }
1963    else
1964    {
1965        for (j = blkheight; j > 0; j--)
1966        {
1967            for (i = blkwidth; i > 0; i -= 4)
1968            {
1969                temp = *((uint32*)in);
1970                *((uint32*)out) = temp;
1971                in += 4;
1972                out += 4;
1973            }
1974            out += offset_out;
1975            in += offset_in;
1976        }
1977    }
1978    return ;
1979}
1980
1981void ChromaMotionComp(uint8 *ref, int picwidth, int picheight,
1982                      int x_pos, int y_pos,
1983                      uint8 *pred, int pred_pitch,
1984                      int blkwidth, int blkheight)
1985{
1986    int dx, dy;
1987    int offset_dx, offset_dy;
1988    int index;
1989    uint8 temp[24][24];
1990
1991    dx = x_pos & 7;
1992    dy = y_pos & 7;
1993    offset_dx = (dx + 7) >> 3;
1994    offset_dy = (dy + 7) >> 3;
1995    x_pos = x_pos >> 3;  /* round it to full-pel resolution */
1996    y_pos = y_pos >> 3;
1997
1998    if ((x_pos >= 0 && x_pos + blkwidth + offset_dx <= picwidth) && (y_pos >= 0 && y_pos + blkheight + offset_dy <= picheight))
1999    {
2000        ref += y_pos * picwidth + x_pos;
2001    }
2002    else
2003    {
2004        CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth + offset_dx, blkheight + offset_dy);
2005        ref = &temp[0][0];
2006        picwidth = 24;
2007    }
2008
2009    index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
2010
2011    (*(ChromaMC_SIMD[index]))(ref, picwidth , dx, dy, pred, pred_pitch, blkwidth, blkheight);
2012    return ;
2013}
2014
2015
2016/* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done)  */
2017void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2018                           uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2019{
2020    int32 r0, r1, r2, r3, result0, result1;
2021    uint8 temp[288];
2022    uint8 *ref, *out;
2023    int i, j;
2024    int dx_8 = 8 - dx;
2025    int dy_8 = 8 - dy;
2026
2027    /* horizontal first */
2028    out = temp;
2029    for (i = 0; i < blkheight + 1; i++)
2030    {
2031        ref = pRef;
2032        r0 = ref[0];
2033        for (j = 0; j < blkwidth; j += 4)
2034        {
2035            r0 |= (ref[2] << 16);
2036            result0 = dx_8 * r0;
2037
2038            r1 = ref[1] | (ref[3] << 16);
2039            result0 += dx * r1;
2040            *(int32 *)out = result0;
2041
2042            result0 = dx_8 * r1;
2043
2044            r2 = ref[4];
2045            r0 = r0 >> 16;
2046            r1 = r0 | (r2 << 16);
2047            result0 += dx * r1;
2048            *(int32 *)(out + 16) = result0;
2049
2050            ref += 4;
2051            out += 4;
2052            r0 = r2;
2053        }
2054        pRef += srcPitch;
2055        out += (32 - blkwidth);
2056    }
2057
2058//  pRef -= srcPitch*(blkheight+1);
2059    ref = temp;
2060
2061    for (j = 0; j < blkwidth; j += 4)
2062    {
2063        r0 = *(int32 *)ref;
2064        r1 = *(int32 *)(ref + 16);
2065        ref += 32;
2066        out = pOut;
2067        for (i = 0; i < (blkheight >> 1); i++)
2068        {
2069            result0 = dy_8 * r0 + 0x00200020;
2070            r2 = *(int32 *)ref;
2071            result0 += dy * r2;
2072            result0 >>= 6;
2073            result0 &= 0x00FF00FF;
2074            r0 = r2;
2075
2076            result1 = dy_8 * r1 + 0x00200020;
2077            r3 = *(int32 *)(ref + 16);
2078            result1 += dy * r3;
2079            result1 >>= 6;
2080            result1 &= 0x00FF00FF;
2081            r1 = r3;
2082            *(int32 *)out = result0 | (result1 << 8);
2083            out += predPitch;
2084            ref += 32;
2085
2086            result0 = dy_8 * r0 + 0x00200020;
2087            r2 = *(int32 *)ref;
2088            result0 += dy * r2;
2089            result0 >>= 6;
2090            result0 &= 0x00FF00FF;
2091            r0 = r2;
2092
2093            result1 = dy_8 * r1 + 0x00200020;
2094            r3 = *(int32 *)(ref + 16);
2095            result1 += dy * r3;
2096            result1 >>= 6;
2097            result1 &= 0x00FF00FF;
2098            r1 = r3;
2099            *(int32 *)out = result0 | (result1 << 8);
2100            out += predPitch;
2101            ref += 32;
2102        }
2103        pOut += 4;
2104        ref = temp + 4; /* since it can only iterate twice max  */
2105    }
2106    return;
2107}
2108
2109void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2110                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2111{
2112    OSCL_UNUSED_ARG(dy);
2113    int32 r0, r1, r2, result0, result1;
2114    uint8 *ref, *out;
2115    int i, j;
2116    int dx_8 = 8 - dx;
2117
2118    /* horizontal first */
2119    for (i = 0; i < blkheight; i++)
2120    {
2121        ref = pRef;
2122        out = pOut;
2123
2124        r0 = ref[0];
2125        for (j = 0; j < blkwidth; j += 4)
2126        {
2127            r0 |= (ref[2] << 16);
2128            result0 = dx_8 * r0 + 0x00040004;
2129
2130            r1 = ref[1] | (ref[3] << 16);
2131            result0 += dx * r1;
2132            result0 >>= 3;
2133            result0 &= 0x00FF00FF;
2134
2135            result1 = dx_8 * r1 + 0x00040004;
2136
2137            r2 = ref[4];
2138            r0 = r0 >> 16;
2139            r1 = r0 | (r2 << 16);
2140            result1 += dx * r1;
2141            result1 >>= 3;
2142            result1 &= 0x00FF00FF;
2143
2144            *(int32 *)out = result0 | (result1 << 8);
2145
2146            ref += 4;
2147            out += 4;
2148            r0 = r2;
2149        }
2150
2151        pRef += srcPitch;
2152        pOut += predPitch;
2153    }
2154    return;
2155}
2156
2157void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2158                           uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2159{
2160    OSCL_UNUSED_ARG(dx);
2161    int32 r0, r1, r2, r3, result0, result1;
2162    int i, j;
2163    uint8 *ref, *out;
2164    int dy_8 = 8 - dy;
2165    /* vertical first */
2166    for (i = 0; i < blkwidth; i += 4)
2167    {
2168        ref = pRef;
2169        out = pOut;
2170
2171        r0 = ref[0] | (ref[2] << 16);
2172        r1 = ref[1] | (ref[3] << 16);
2173        ref += srcPitch;
2174        for (j = 0; j < blkheight; j++)
2175        {
2176            result0 = dy_8 * r0 + 0x00040004;
2177            r2 = ref[0] | (ref[2] << 16);
2178            result0 += dy * r2;
2179            result0 >>= 3;
2180            result0 &= 0x00FF00FF;
2181            r0 = r2;
2182
2183            result1 = dy_8 * r1 + 0x00040004;
2184            r3 = ref[1] | (ref[3] << 16);
2185            result1 += dy * r3;
2186            result1 >>= 3;
2187            result1 &= 0x00FF00FF;
2188            r1 = r3;
2189            *(int32 *)out = result0 | (result1 << 8);
2190            ref += srcPitch;
2191            out += predPitch;
2192        }
2193        pOut += 4;
2194        pRef += 4;
2195    }
2196    return;
2197}
2198
2199void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2200                            uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
2201{
2202    OSCL_UNUSED_ARG(blkwidth);
2203    int32 r0, r1, temp0, temp1, result;
2204    int32 temp[9];
2205    int32 *out;
2206    int i, r_temp;
2207    int dy_8 = 8 - dy;
2208
2209    /* horizontal first */
2210    out = temp;
2211    for (i = 0; i < blkheight + 1; i++)
2212    {
2213        r_temp = pRef[1];
2214        temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
2215        temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
2216        r0 = temp0 | (temp1 << 16);
2217        *out++ = r0;
2218        pRef += srcPitch;
2219    }
2220
2221    pRef -= srcPitch * (blkheight + 1);
2222
2223    out = temp;
2224
2225    r0 = *out++;
2226
2227    for (i = 0; i < blkheight; i++)
2228    {
2229        result = dy_8 * r0 + 0x00200020;
2230        r1 = *out++;
2231        result += dy * r1;
2232        result >>= 6;
2233        result &= 0x00FF00FF;
2234        *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2235        r0 = r1;
2236        pOut += predPitch;
2237    }
2238    return;
2239}
2240
2241void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2242                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2243{
2244    OSCL_UNUSED_ARG(dy);
2245    OSCL_UNUSED_ARG(blkwidth);
2246    int i, temp, temp0, temp1;
2247
2248    /* horizontal first */
2249    for (i = 0; i < blkheight; i++)
2250    {
2251        temp = pRef[1];
2252        temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
2253        temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
2254
2255        *(int16 *)pOut = temp0 | (temp1 << 8);
2256        pRef += srcPitch;
2257        pOut += predPitch;
2258
2259    }
2260    return;
2261}
2262void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2263                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2264{
2265    OSCL_UNUSED_ARG(dx);
2266    OSCL_UNUSED_ARG(blkwidth);
2267    int32 r0, r1, result;
2268    int i;
2269    int dy_8 = 8 - dy;
2270    r0 = pRef[0] | (pRef[1] << 16);
2271    pRef += srcPitch;
2272    for (i = 0; i < blkheight; i++)
2273    {
2274        result = dy_8 * r0 + 0x00040004;
2275        r1 = pRef[0] | (pRef[1] << 16);
2276        result += dy * r1;
2277        result >>= 3;
2278        result &= 0x00FF00FF;
2279        *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2280        r0 = r1;
2281        pRef += srcPitch;
2282        pOut += predPitch;
2283    }
2284    return;
2285}
2286
2287void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2288                       uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2289{
2290    OSCL_UNUSED_ARG(dx);
2291    OSCL_UNUSED_ARG(dy);
2292    int i, j;
2293    int offset_in = srcPitch - blkwidth;
2294    int offset_out = predPitch - blkwidth;
2295    uint16 temp;
2296    uint8 byte;
2297
2298    if (((uint32)pRef)&1)
2299    {
2300        for (j = blkheight; j > 0; j--)
2301        {
2302            for (i = blkwidth; i > 0; i -= 2)
2303            {
2304                temp = *pRef++;
2305                byte = *pRef++;
2306                temp |= (byte << 8);
2307                *((uint16*)pOut) = temp; /* write 2 bytes */
2308                pOut += 2;
2309            }
2310            pOut += offset_out;
2311            pRef += offset_in;
2312        }
2313    }
2314    else
2315    {
2316        for (j = blkheight; j > 0; j--)
2317        {
2318            for (i = blkwidth; i > 0; i -= 2)
2319            {
2320                temp = *((uint16*)pRef);
2321                *((uint16*)pOut) = temp;
2322                pRef += 2;
2323                pOut += 2;
2324            }
2325            pOut += offset_out;
2326            pRef += offset_in;
2327        }
2328    }
2329    return ;
2330}