/Avc/pred_inter.cpp
C++ | 2330 lines | 1974 code | 201 blank | 155 comment | 158 complexity | 5750c7959b2cde35dcd102bfcf79ae88 MD5 | raw file
Large files files are truncated, but you can click here to view the full file
1/* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18#include "avcdec_lib.h" 19 20 21#define CLIP_RESULT(x) if((uint)x > 0xFF){ \ 22 x = 0xFF & (~(x>>31));} 23 24/* (blkwidth << 2) + (dy << 1) + dx */ 25static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) = 26{ 27 &ChromaFullMC_SIMD, 28 &ChromaHorizontalMC_SIMD, 29 &ChromaVerticalMC_SIMD, 30 &ChromaDiagonalMC_SIMD, 31 &ChromaFullMC_SIMD, 32 &ChromaHorizontalMC2_SIMD, 33 &ChromaVerticalMC2_SIMD, 34 &ChromaDiagonalMC2_SIMD 35}; 36/* Perform motion prediction and compensation with residue if exist. */ 37void InterMBPrediction(AVCCommonObj *video) 38{ 39 AVCMacroblock *currMB = video->currMB; 40 AVCPictureData *currPic = video->currPic; 41 int mbPartIdx, subMbPartIdx; 42 int ref_idx; 43 int offset_MbPart_indx = 0; 44 int16 *mv; 45 uint32 x_pos, y_pos; 46 uint8 *curL, *curCb, *curCr; 47 uint8 *ref_l, *ref_Cb, *ref_Cr; 48 uint8 *predBlock, *predCb, *predCr; 49 int block_x, block_y, offset_x, offset_y, offsetP, offset; 50 int x_position = (video->mb_x << 4); 51 int y_position = (video->mb_y << 4); 52 int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx; 53 int picWidth = currPic->pitch; 54 int picHeight = currPic->height; 55 int16 *dataBlock; 56 uint32 cbp4x4; 57 uint32 tmp_word; 58 59 tmp_word = y_position * picWidth; 60 curL = currPic->Sl + tmp_word + x_position; 61 offset = (tmp_word >> 2) + (x_position >> 1); 62 curCb = currPic->Scb + offset; 63 curCr = currPic->Scr + offset; 64 65#ifdef USE_PRED_BLOCK 66 predBlock = video->pred + 84; 67 predCb = video->pred + 452; 68 predCr = video->pred + 596; 69#else 70 predBlock = curL; 71 predCb = curCb; 72 predCr = curCr; 73#endif 74 75 GetMotionVectorPredictor(video, false); 76 77 for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++) 78 { 79 MbHeight = currMB->SubMbPartHeight[mbPartIdx]; 80 MbWidth = currMB->SubMbPartWidth[mbPartIdx]; 81 mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1); 82 mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1; 83 ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X]; 84 offset_indx = 0; 85 86 ref_l = video->RefPicList0[ref_idx]->Sl; 87 ref_Cb = video->RefPicList0[ref_idx]->Scb; 88 ref_Cr = video->RefPicList0[ref_idx]->Scr; 89 90 for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++) 91 { 92 block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); // check this 93 block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1); 94 mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2)); 95 offset_x = x_position + (block_x << 2); 96 offset_y = y_position + (block_y << 2); 97 x_pos = (offset_x << 2) + *mv++; /*quarter pel */ 98 y_pos = (offset_y << 2) + *mv; /*quarter pel */ 99 100 //offset = offset_y * currPic->width; 101 //offsetC = (offset >> 2) + (offset_x >> 1); 102#ifdef USE_PRED_BLOCK 103 offsetP = (block_y * 80) + (block_x << 2); 104 LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos, 105 /*comp_Sl + offset + offset_x,*/ 106 predBlock + offsetP, 20, MbWidth, MbHeight); 107#else 108 offsetP = (block_y << 2) * picWidth + (block_x << 2); 109 LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos, 110 /*comp_Sl + offset + offset_x,*/ 111 predBlock + offsetP, picWidth, MbWidth, MbHeight); 112#endif 113 114#ifdef USE_PRED_BLOCK 115 offsetP = (block_y * 24) + (block_x << 1); 116 ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 117 /*comp_Scb + offsetC,*/ 118 predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1); 119 ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 120 /*comp_Scr + offsetC,*/ 121 predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1); 122#else 123 offsetP = (block_y * picWidth) + (block_x << 1); 124 ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 125 /*comp_Scb + offsetC,*/ 126 predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1); 127 ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 128 /*comp_Scr + offsetC,*/ 129 predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1); 130#endif 131 132 offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3; 133 } 134 offset_MbPart_indx = currMB->MbPartWidth >> 4; 135 } 136 137 /* used in decoder, used to be if(!encFlag) */ 138 139 /* transform in raster scan order */ 140 dataBlock = video->block; 141 cbp4x4 = video->cbp4x4; 142 /* luma */ 143 for (block_y = 4; block_y > 0; block_y--) 144 { 145 for (block_x = 4; block_x > 0; block_x--) 146 { 147#ifdef USE_PRED_BLOCK 148 if (cbp4x4&1) 149 { 150 itrans(dataBlock, predBlock, predBlock, 20); 151 } 152#else 153 if (cbp4x4&1) 154 { 155 itrans(dataBlock, curL, curL, picWidth); 156 } 157#endif 158 cbp4x4 >>= 1; 159 dataBlock += 4; 160#ifdef USE_PRED_BLOCK 161 predBlock += 4; 162#else 163 curL += 4; 164#endif 165 } 166 dataBlock += 48; 167#ifdef USE_PRED_BLOCK 168 predBlock += 64; 169#else 170 curL += ((picWidth << 2) - 16); 171#endif 172 } 173 174 /* chroma */ 175 picWidth = (picWidth >> 1); 176 for (block_y = 2; block_y > 0; block_y--) 177 { 178 for (block_x = 2; block_x > 0; block_x--) 179 { 180#ifdef USE_PRED_BLOCK 181 if (cbp4x4&1) 182 { 183 ictrans(dataBlock, predCb, predCb, 12); 184 } 185#else 186 if (cbp4x4&1) 187 { 188 ictrans(dataBlock, curCb, curCb, picWidth); 189 } 190#endif 191 cbp4x4 >>= 1; 192 dataBlock += 4; 193#ifdef USE_PRED_BLOCK 194 predCb += 4; 195#else 196 curCb += 4; 197#endif 198 } 199 for (block_x = 2; block_x > 0; block_x--) 200 { 201#ifdef USE_PRED_BLOCK 202 if (cbp4x4&1) 203 { 204 ictrans(dataBlock, predCr, predCr, 12); 205 } 206#else 207 if (cbp4x4&1) 208 { 209 ictrans(dataBlock, curCr, curCr, picWidth); 210 } 211#endif 212 cbp4x4 >>= 1; 213 dataBlock += 4; 214#ifdef USE_PRED_BLOCK 215 predCr += 4; 216#else 217 curCr += 4; 218#endif 219 } 220 dataBlock += 48; 221#ifdef USE_PRED_BLOCK 222 predCb += 40; 223 predCr += 40; 224#else 225 curCb += ((picWidth << 2) - 8); 226 curCr += ((picWidth << 2) - 8); 227#endif 228 } 229 230#ifdef MB_BASED_DEBLOCK 231 SaveNeighborForIntraPred(video, offset); 232#endif 233 234 return ; 235} 236 237 238/* preform the actual motion comp here */ 239void LumaMotionComp(uint8 *ref, int picwidth, int picheight, 240 int x_pos, int y_pos, 241 uint8 *pred, int pred_pitch, 242 int blkwidth, int blkheight) 243{ 244 int dx, dy; 245 uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */ 246 int temp2[21][21]; /* for intermediate results */ 247 uint8 *ref2; 248 249 dx = x_pos & 3; 250 dy = y_pos & 3; 251 x_pos = x_pos >> 2; /* round it to full-pel resolution */ 252 y_pos = y_pos >> 2; 253 254 /* perform actual motion compensation */ 255 if (dx == 0 && dy == 0) 256 { /* fullpel position *//* G */ 257 if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight) 258 { 259 ref += y_pos * picwidth + x_pos; 260 FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight); 261 } 262 else 263 { 264 CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight); 265 FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight); 266 } 267 268 } /* other positions */ 269 else if (dy == 0) 270 { /* no vertical interpolation *//* a,b,c*/ 271 272 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight) 273 { 274 ref += y_pos * picwidth + x_pos; 275 276 HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx); 277 } 278 else /* need padding */ 279 { 280 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight); 281 282 HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx); 283 } 284 } 285 else if (dx == 0) 286 { /*no horizontal interpolation *//* d,h,n */ 287 288 if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight) 289 { 290 ref += y_pos * picwidth + x_pos; 291 292 VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy); 293 } 294 else /* need padding */ 295 { 296 CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5); 297 298 VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy); 299 } 300 } 301 else if (dy == 2) 302 { /* horizontal cross *//* i, j, k */ 303 304 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight) 305 { 306 ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */ 307 308 VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight); 309 310 HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); 311 } 312 else /* need padding */ 313 { 314 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5); 315 316 VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight); 317 318 HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); 319 } 320 } 321 else if (dx == 2) 322 { /* vertical cross */ /* f,q */ 323 324 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight) 325 { 326 ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */ 327 328 HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5); 329 VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); 330 } 331 else /* need padding */ 332 { 333 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5); 334 HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5); 335 VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); 336 } 337 } 338 else 339 { /* diagonal *//* e,g,p,r */ 340 341 if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth && 342 y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight) 343 { 344 ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos; 345 346 ref += (y_pos * picwidth) + x_pos + (dx / 2); 347 348 DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight); 349 } 350 else /* need padding */ 351 { 352 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2)); 353 354 ref2 = &temp[2 + (dy/2)][2]; 355 356 ref = &temp[2][2 + (dx/2)]; 357 358 DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight); 359 } 360 } 361 362 return ; 363} 364 365void CreateAlign(uint8 *ref, int picwidth, int y_pos, 366 uint8 *out, int blkwidth, int blkheight) 367{ 368 int i, j; 369 int offset, out_offset; 370 uint32 prev_pix, result, pix1, pix2, pix4; 371 372 out_offset = 24 - blkwidth; 373 374 //switch(x_pos&0x3){ 375 switch (((uint32)ref)&0x3) 376 { 377 case 1: 378 ref += y_pos * picwidth; 379 offset = picwidth - blkwidth - 3; 380 for (j = 0; j < blkheight; j++) 381 { 382 pix1 = *ref++; 383 pix2 = *((uint16*)ref); 384 ref += 2; 385 result = (pix2 << 8) | pix1; 386 387 for (i = 3; i < blkwidth; i += 4) 388 { 389 pix4 = *((uint32*)ref); 390 ref += 4; 391 prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */ 392 result |= prev_pix; 393 *((uint32*)out) = result; /* write 4 bytes */ 394 out += 4; 395 result = pix4 >> 8; /* for the next loop */ 396 } 397 ref += offset; 398 out += out_offset; 399 } 400 break; 401 case 2: 402 ref += y_pos * picwidth; 403 offset = picwidth - blkwidth - 2; 404 for (j = 0; j < blkheight; j++) 405 { 406 result = *((uint16*)ref); 407 ref += 2; 408 for (i = 2; i < blkwidth; i += 4) 409 { 410 pix4 = *((uint32*)ref); 411 ref += 4; 412 prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */ 413 result |= prev_pix; 414 *((uint32*)out) = result; /* write 4 bytes */ 415 out += 4; 416 result = pix4 >> 16; /* for the next loop */ 417 } 418 ref += offset; 419 out += out_offset; 420 } 421 break; 422 case 3: 423 ref += y_pos * picwidth; 424 offset = picwidth - blkwidth - 1; 425 for (j = 0; j < blkheight; j++) 426 { 427 result = *ref++; 428 for (i = 1; i < blkwidth; i += 4) 429 { 430 pix4 = *((uint32*)ref); 431 ref += 4; 432 prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */ 433 result |= prev_pix; 434 *((uint32*)out) = result; /* write 4 bytes */ 435 out += 4; 436 result = pix4 >> 24; /* for the next loop */ 437 } 438 ref += offset; 439 out += out_offset; 440 } 441 break; 442 } 443} 444 445void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos, 446 uint8 *out, int blkwidth, int blkheight) 447{ 448 int x_inc0, x_mid; 449 int y_inc, y_inc0, y_inc1, y_mid; 450 int i, j; 451 int offset; 452 453 if (x_pos < 0) 454 { 455 x_inc0 = 0; /* increment for the first part */ 456 x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth); /* stopping point */ 457 x_pos = 0; 458 } 459 else if (x_pos + blkwidth > picwidth) 460 { 461 x_inc0 = 1; /* increasing */ 462 x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0); /* clip negative to zero, encode fool proof! */ 463 } 464 else /* normal case */ 465 { 466 x_inc0 = 1; 467 x_mid = blkwidth; /* just one run */ 468 } 469 470 471 /* boundary for y_pos, taking the result from x_pos into account */ 472 if (y_pos < 0) 473 { 474 y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */ 475 y_inc1 = picwidth + y_inc0; 476 y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */ 477 y_pos = 0; 478 } 479 else if (y_pos + blkheight > picheight) 480 { 481 y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */ 482 y_inc0 = picwidth + y_inc1; /* increasing */ 483 y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0); 484 } 485 else /* normal case */ 486 { 487 y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); 488 y_inc0 = picwidth + y_inc1; 489 y_mid = blkheight; 490 } 491 492 /* clip y_pos and x_pos */ 493 if (y_pos > picheight - 1) y_pos = picheight - 1; 494 if (x_pos > picwidth - 1) x_pos = picwidth - 1; 495 496 ref += y_pos * picwidth + x_pos; 497 498 y_inc = y_inc0; /* start with top half */ 499 500 offset = 24 - blkwidth; /* to use in offset out */ 501 blkwidth -= x_mid; /* to use in the loop limit */ 502 503 if (x_inc0 == 0) 504 { 505 for (j = 0; j < blkheight; j++) 506 { 507 if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */ 508 { 509 y_inc = y_inc1; /* switch to lower half */ 510 } 511 for (i = x_mid; i > 0; i--) /* first or third quarter */ 512 { 513 *out++ = *ref; 514 } 515 for (i = blkwidth; i > 0; i--) /* second or fourth quarter */ 516 { 517 *out++ = *ref++; 518 } 519 out += offset; 520 ref += y_inc; 521 } 522 } 523 else 524 { 525 for (j = 0; j < blkheight; j++) 526 { 527 if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */ 528 { 529 y_inc = y_inc1; /* switch to lower half */ 530 } 531 for (i = x_mid; i > 0; i--) /* first or third quarter */ 532 { 533 *out++ = *ref++; 534 } 535 for (i = blkwidth; i > 0; i--) /* second or fourth quarter */ 536 { 537 *out++ = *ref; 538 } 539 out += offset; 540 ref += y_inc; 541 } 542 } 543 544 return ; 545} 546 547void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 548 int blkwidth, int blkheight, int dx) 549{ 550 uint8 *p_ref; 551 uint32 *p_cur; 552 uint32 tmp; 553 uint32 pkres; 554 int result, curr_offset, ref_offset; 555 int j; 556 int32 r0, r1, r2, r3, r4, r5; 557 int32 r13, r6; 558 559 p_cur = (uint32*)out; /* assume it's word aligned */ 560 curr_offset = (outpitch - blkwidth) >> 2; 561 p_ref = in; 562 ref_offset = inpitch - blkwidth; 563 564 if (dx&1) 565 { 566 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 567 p_ref -= 2; 568 r13 = 0; 569 for (j = blkheight; j > 0; j--) 570 { 571 tmp = (uint32)(p_ref + blkwidth); 572 r0 = p_ref[0]; 573 r1 = p_ref[2]; 574 r0 |= (r1 << 16); /* 0,c,0,a */ 575 r1 = p_ref[1]; 576 r2 = p_ref[3]; 577 r1 |= (r2 << 16); /* 0,d,0,b */ 578 while ((uint32)p_ref < tmp) 579 { 580 r2 = *(p_ref += 4); /* move pointer to e */ 581 r3 = p_ref[2]; 582 r2 |= (r3 << 16); /* 0,g,0,e */ 583 r3 = p_ref[1]; 584 r4 = p_ref[3]; 585 r3 |= (r4 << 16); /* 0,h,0,f */ 586 587 r4 = r0 + r3; /* c+h, a+f */ 588 r5 = r0 + r1; /* c+d, a+b */ 589 r6 = r2 + r3; /* g+h, e+f */ 590 r5 >>= 16; 591 r5 |= (r6 << 16); /* e+f, c+d */ 592 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 593 r4 += 0x100010; /* +16, +16 */ 594 r5 = r1 + r2; /* d+g, b+e */ 595 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 596 r4 >>= 5; 597 r13 |= r4; /* check clipping */ 598 599 r5 = p_ref[dx+2]; 600 r6 = p_ref[dx+4]; 601 r5 |= (r6 << 16); 602 r4 += r5; 603 r4 += 0x10001; 604 r4 = (r4 >> 1) & 0xFF00FF; 605 606 r5 = p_ref[4]; /* i */ 607 r6 = (r5 << 16); 608 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 609 r5 += r1; /* d+i, b+g */ /* r5 not free */ 610 r1 >>= 16; 611 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 612 r1 += r2; /* f+g, d+e */ 613 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 614 r0 >>= 16; 615 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 616 r0 += r3; /* e+h, c+f */ 617 r5 += 0x100010; /* 16,16 */ 618 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 619 r5 >>= 5; 620 r13 |= r5; /* check clipping */ 621 622 r0 = p_ref[dx+3]; 623 r1 = p_ref[dx+5]; 624 r0 |= (r1 << 16); 625 r5 += r0; 626 r5 += 0x10001; 627 r5 = (r5 >> 1) & 0xFF00FF; 628 629 r4 |= (r5 << 8); /* pack them together */ 630 *p_cur++ = r4; 631 r1 = r3; 632 r0 = r2; 633 } 634 p_cur += curr_offset; /* move to the next line */ 635 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 636 637 if (r13&0xFF000700) /* need clipping */ 638 { 639 /* move back to the beginning of the line */ 640 p_ref -= (ref_offset + blkwidth); /* input */ 641 p_cur -= (outpitch >> 2); 642 643 tmp = (uint32)(p_ref + blkwidth); 644 for (; (uint32)p_ref < tmp;) 645 { 646 647 r0 = *p_ref++; 648 r1 = *p_ref++; 649 r2 = *p_ref++; 650 r3 = *p_ref++; 651 r4 = *p_ref++; 652 /* first pixel */ 653 r5 = *p_ref++; 654 result = (r0 + r5); 655 r0 = (r1 + r4); 656 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 657 r0 = (r2 + r3); 658 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 659 result = (result + 16) >> 5; 660 CLIP_RESULT(result) 661 /* 3/4 pel, no need to clip */ 662 result = (result + p_ref[dx] + 1); 663 pkres = (result >> 1) ; 664 /* second pixel */ 665 r0 = *p_ref++; 666 result = (r1 + r0); 667 r1 = (r2 + r5); 668 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 669 r1 = (r3 + r4); 670 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 671 result = (result + 16) >> 5; 672 CLIP_RESULT(result) 673 /* 3/4 pel, no need to clip */ 674 result = (result + p_ref[dx] + 1); 675 result = (result >> 1); 676 pkres |= (result << 8); 677 /* third pixel */ 678 r1 = *p_ref++; 679 result = (r2 + r1); 680 r2 = (r3 + r0); 681 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 682 r2 = (r4 + r5); 683 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 684 result = (result + 16) >> 5; 685 CLIP_RESULT(result) 686 /* 3/4 pel, no need to clip */ 687 result = (result + p_ref[dx] + 1); 688 result = (result >> 1); 689 pkres |= (result << 16); 690 /* fourth pixel */ 691 r2 = *p_ref++; 692 result = (r3 + r2); 693 r3 = (r4 + r1); 694 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 695 r3 = (r5 + r0); 696 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 697 result = (result + 16) >> 5; 698 CLIP_RESULT(result) 699 /* 3/4 pel, no need to clip */ 700 result = (result + p_ref[dx] + 1); 701 result = (result >> 1); 702 pkres |= (result << 24); 703 *p_cur++ = pkres; /* write 4 pixels */ 704 p_ref -= 5; /* offset back to the middle of filter */ 705 } 706 p_cur += curr_offset; /* move to the next line */ 707 p_ref += ref_offset; /* move to the next line */ 708 } 709 } 710 } 711 else 712 { 713 p_ref -= 2; 714 r13 = 0; 715 for (j = blkheight; j > 0; j--) 716 { 717 tmp = (uint32)(p_ref + blkwidth); 718 r0 = p_ref[0]; 719 r1 = p_ref[2]; 720 r0 |= (r1 << 16); /* 0,c,0,a */ 721 r1 = p_ref[1]; 722 r2 = p_ref[3]; 723 r1 |= (r2 << 16); /* 0,d,0,b */ 724 while ((uint32)p_ref < tmp) 725 { 726 r2 = *(p_ref += 4); /* move pointer to e */ 727 r3 = p_ref[2]; 728 r2 |= (r3 << 16); /* 0,g,0,e */ 729 r3 = p_ref[1]; 730 r4 = p_ref[3]; 731 r3 |= (r4 << 16); /* 0,h,0,f */ 732 733 r4 = r0 + r3; /* c+h, a+f */ 734 r5 = r0 + r1; /* c+d, a+b */ 735 r6 = r2 + r3; /* g+h, e+f */ 736 r5 >>= 16; 737 r5 |= (r6 << 16); /* e+f, c+d */ 738 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 739 r4 += 0x100010; /* +16, +16 */ 740 r5 = r1 + r2; /* d+g, b+e */ 741 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 742 r4 >>= 5; 743 r13 |= r4; /* check clipping */ 744 r4 &= 0xFF00FF; /* mask */ 745 746 r5 = p_ref[4]; /* i */ 747 r6 = (r5 << 16); 748 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 749 r5 += r1; /* d+i, b+g */ /* r5 not free */ 750 r1 >>= 16; 751 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 752 r1 += r2; /* f+g, d+e */ 753 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 754 r0 >>= 16; 755 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 756 r0 += r3; /* e+h, c+f */ 757 r5 += 0x100010; /* 16,16 */ 758 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 759 r5 >>= 5; 760 r13 |= r5; /* check clipping */ 761 r5 &= 0xFF00FF; /* mask */ 762 763 r4 |= (r5 << 8); /* pack them together */ 764 *p_cur++ = r4; 765 r1 = r3; 766 r0 = r2; 767 } 768 p_cur += curr_offset; /* move to the next line */ 769 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 770 771 if (r13&0xFF000700) /* need clipping */ 772 { 773 /* move back to the beginning of the line */ 774 p_ref -= (ref_offset + blkwidth); /* input */ 775 p_cur -= (outpitch >> 2); 776 777 tmp = (uint32)(p_ref + blkwidth); 778 for (; (uint32)p_ref < tmp;) 779 { 780 781 r0 = *p_ref++; 782 r1 = *p_ref++; 783 r2 = *p_ref++; 784 r3 = *p_ref++; 785 r4 = *p_ref++; 786 /* first pixel */ 787 r5 = *p_ref++; 788 result = (r0 + r5); 789 r0 = (r1 + r4); 790 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 791 r0 = (r2 + r3); 792 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 793 result = (result + 16) >> 5; 794 CLIP_RESULT(result) 795 pkres = result; 796 /* second pixel */ 797 r0 = *p_ref++; 798 result = (r1 + r0); 799 r1 = (r2 + r5); 800 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 801 r1 = (r3 + r4); 802 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 803 result = (result + 16) >> 5; 804 CLIP_RESULT(result) 805 pkres |= (result << 8); 806 /* third pixel */ 807 r1 = *p_ref++; 808 result = (r2 + r1); 809 r2 = (r3 + r0); 810 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 811 r2 = (r4 + r5); 812 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 813 result = (result + 16) >> 5; 814 CLIP_RESULT(result) 815 pkres |= (result << 16); 816 /* fourth pixel */ 817 r2 = *p_ref++; 818 result = (r3 + r2); 819 r3 = (r4 + r1); 820 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 821 r3 = (r5 + r0); 822 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 823 result = (result + 16) >> 5; 824 CLIP_RESULT(result) 825 pkres |= (result << 24); 826 *p_cur++ = pkres; /* write 4 pixels */ 827 p_ref -= 5; 828 } 829 p_cur += curr_offset; /* move to the next line */ 830 p_ref += ref_offset; 831 } 832 } 833 } 834 835 return ; 836} 837 838void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch, 839 int blkwidth, int blkheight, int dx) 840{ 841 int *p_ref; 842 uint32 *p_cur; 843 uint32 tmp, pkres; 844 int result, result2, curr_offset, ref_offset; 845 int j, r0, r1, r2, r3, r4, r5; 846 847 p_cur = (uint32*)out; /* assume it's word aligned */ 848 curr_offset = (outpitch - blkwidth) >> 2; 849 p_ref = in; 850 ref_offset = inpitch - blkwidth; 851 852 if (dx&1) 853 { 854 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 855 856 for (j = blkheight; j > 0 ; j--) 857 { 858 tmp = (uint32)(p_ref + blkwidth); 859 for (; (uint32)p_ref < tmp;) 860 { 861 862 r0 = p_ref[-2]; 863 r1 = p_ref[-1]; 864 r2 = *p_ref++; 865 r3 = *p_ref++; 866 r4 = *p_ref++; 867 /* first pixel */ 868 r5 = *p_ref++; 869 result = (r0 + r5); 870 r0 = (r1 + r4); 871 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 872 r0 = (r2 + r3); 873 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 874 result = (result + 512) >> 10; 875 CLIP_RESULT(result) 876 result2 = ((p_ref[dx] + 16) >> 5); 877 CLIP_RESULT(result2) 878 /* 3/4 pel, no need to clip */ 879 result = (result + result2 + 1); 880 pkres = (result >> 1); 881 /* second pixel */ 882 r0 = *p_ref++; 883 result = (r1 + r0); 884 r1 = (r2 + r5); 885 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 886 r1 = (r3 + r4); 887 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 888 result = (result + 512) >> 10; 889 CLIP_RESULT(result) 890 result2 = ((p_ref[dx] + 16) >> 5); 891 CLIP_RESULT(result2) 892 /* 3/4 pel, no need to clip */ 893 result = (result + result2 + 1); 894 result = (result >> 1); 895 pkres |= (result << 8); 896 /* third pixel */ 897 r1 = *p_ref++; 898 result = (r2 + r1); 899 r2 = (r3 + r0); 900 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 901 r2 = (r4 + r5); 902 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 903 result = (result + 512) >> 10; 904 CLIP_RESULT(result) 905 result2 = ((p_ref[dx] + 16) >> 5); 906 CLIP_RESULT(result2) 907 /* 3/4 pel, no need to clip */ 908 result = (result + result2 + 1); 909 result = (result >> 1); 910 pkres |= (result << 16); 911 /* fourth pixel */ 912 r2 = *p_ref++; 913 result = (r3 + r2); 914 r3 = (r4 + r1); 915 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 916 r3 = (r5 + r0); 917 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 918 result = (result + 512) >> 10; 919 CLIP_RESULT(result) 920 result2 = ((p_ref[dx] + 16) >> 5); 921 CLIP_RESULT(result2) 922 /* 3/4 pel, no need to clip */ 923 result = (result + result2 + 1); 924 result = (result >> 1); 925 pkres |= (result << 24); 926 *p_cur++ = pkres; /* write 4 pixels */ 927 p_ref -= 3; /* offset back to the middle of filter */ 928 } 929 p_cur += curr_offset; /* move to the next line */ 930 p_ref += ref_offset; /* move to the next line */ 931 } 932 } 933 else 934 { 935 for (j = blkheight; j > 0 ; j--) 936 { 937 tmp = (uint32)(p_ref + blkwidth); 938 for (; (uint32)p_ref < tmp;) 939 { 940 941 r0 = p_ref[-2]; 942 r1 = p_ref[-1]; 943 r2 = *p_ref++; 944 r3 = *p_ref++; 945 r4 = *p_ref++; 946 /* first pixel */ 947 r5 = *p_ref++; 948 result = (r0 + r5); 949 r0 = (r1 + r4); 950 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 951 r0 = (r2 + r3); 952 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 953 result = (result + 512) >> 10; 954 CLIP_RESULT(result) 955 pkres = result; 956 /* second pixel */ 957 r0 = *p_ref++; 958 result = (r1 + r0); 959 r1 = (r2 + r5); 960 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 961 r1 = (r3 + r4); 962 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 963 result = (result + 512) >> 10; 964 CLIP_RESULT(result) 965 pkres |= (result << 8); 966 /* third pixel */ 967 r1 = *p_ref++; 968 result = (r2 + r1); 969 r2 = (r3 + r0); 970 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 971 r2 = (r4 + r5); 972 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 973 result = (result + 512) >> 10; 974 CLIP_RESULT(result) 975 pkres |= (result << 16); 976 /* fourth pixel */ 977 r2 = *p_ref++; 978 result = (r3 + r2); 979 r3 = (r4 + r1); 980 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 981 r3 = (r5 + r0); 982 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 983 result = (result + 512) >> 10; 984 CLIP_RESULT(result) 985 pkres |= (result << 24); 986 *p_cur++ = pkres; /* write 4 pixels */ 987 p_ref -= 3; /* offset back to the middle of filter */ 988 } 989 p_cur += curr_offset; /* move to the next line */ 990 p_ref += ref_offset; /* move to the next line */ 991 } 992 } 993 994 return ; 995} 996 997void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch, 998 int blkwidth, int blkheight) 999{ 1000 uint8 *p_ref; 1001 int *p_cur; 1002 uint32 tmp; 1003 int result, curr_offset, ref_offset; 1004 int j, r0, r1, r2, r3, r4, r5; 1005 1006 p_cur = out; 1007 curr_offset = (outpitch - blkwidth); 1008 p_ref = in; 1009 ref_offset = inpitch - blkwidth; 1010 1011 for (j = blkheight; j > 0 ; j--) 1012 { 1013 tmp = (uint32)(p_ref + blkwidth); 1014 for (; (uint32)p_ref < tmp;) 1015 { 1016 1017 r0 = p_ref[-2]; 1018 r1 = p_ref[-1]; 1019 r2 = *p_ref++; 1020 r3 = *p_ref++; 1021 r4 = *p_ref++; 1022 /* first pixel */ 1023 r5 = *p_ref++; 1024 result = (r0 + r5); 1025 r0 = (r1 + r4); 1026 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1027 r0 = (r2 + r3); 1028 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1029 *p_cur++ = result; 1030 /* second pixel */ 1031 r0 = *p_ref++; 1032 result = (r1 + r0); 1033 r1 = (r2 + r5); 1034 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1035 r1 = (r3 + r4); 1036 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1037 *p_cur++ = result; 1038 /* third pixel */ 1039 r1 = *p_ref++; 1040 result = (r2 + r1); 1041 r2 = (r3 + r0); 1042 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1043 r2 = (r4 + r5); 1044 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1045 *p_cur++ = result; 1046 /* fourth pixel */ 1047 r2 = *p_ref++; 1048 result = (r3 + r2); 1049 r3 = (r4 + r1); 1050 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1051 r3 = (r5 + r0); 1052 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1053 *p_cur++ = result; 1054 p_ref -= 3; /* move back to the middle of the filter */ 1055 } 1056 p_cur += curr_offset; /* move to the next line */ 1057 p_ref += ref_offset; 1058 } 1059 1060 return ; 1061} 1062void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 1063 int blkwidth, int blkheight, int dy) 1064{ 1065 uint8 *p_cur, *p_ref; 1066 uint32 tmp; 1067 int result, curr_offset, ref_offset; 1068 int j, i; 1069 int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13; 1070 uint8 tmp_in[24][24]; 1071 1072 /* not word-aligned */ 1073 if (((uint32)in)&0x3) 1074 { 1075 CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 1076 in = &tmp_in[2][0]; 1077 inpitch = 24; 1078 } 1079 p_cur = out; 1080 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1081 ref_offset = blkheight * inpitch; /* for limit */ 1082 1083 curr_offset += 3; 1084 1085 if (dy&1) 1086 { 1087 dy = (dy >> 1) ? 0 : -inpitch; 1088 1089 for (j = 0; j < blkwidth; j += 4, in += 4) 1090 { 1091 r13 = 0; 1092 p_ref = in; 1093 p_cur -= outpitch; /* compensate for the first offset */ 1094 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1095 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1096 { 1097 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1098 p_ref += inpitch; 1099 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1100 r0 &= 0xFF00FF; 1101 1102 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1103 r7 = (r1 >> 8) & 0xFF00FF; 1104 r1 &= 0xFF00FF; 1105 1106 r0 += r1; 1107 r6 += r7; 1108 1109 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1110 r8 = (r2 >> 8) & 0xFF00FF; 1111 r2 &= 0xFF00FF; 1112 1113 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1114 r7 = (r1 >> 8) & 0xFF00FF; 1115 r1 &= 0xFF00FF; 1116 r1 += r2; 1117 1118 r7 += r8; 1119 1120 r0 += 20 * r1; 1121 r6 += 20 * r7; 1122 r0 += 0x100010; 1123 r6 += 0x100010; 1124 1125 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1126 r8 = (r2 >> 8) & 0xFF00FF; 1127 r2 &= 0xFF00FF; 1128 1129 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1130 r7 = (r1 >> 8) & 0xFF00FF; 1131 r1 &= 0xFF00FF; 1132 r1 += r2; 1133 1134 r7 += r8; 1135 1136 r0 -= 5 * r1; 1137 r6 -= 5 * r7; 1138 1139 r0 >>= 5; 1140 r6 >>= 5; 1141 /* clip */ 1142 r13 |= r6; 1143 r13 |= r0; 1144 //CLIPPACK(r6,result) 1145 1146 r1 = *((uint32*)(p_ref + dy)); 1147 r2 = (r1 >> 8) & 0xFF00FF; 1148 r1 &= 0xFF00FF; 1149 r0 += r1; 1150 r6 += r2; 1151 r0 += 0x10001; 1152 r6 += 0x10001; 1153 r0 = (r0 >> 1) & 0xFF00FF; 1154 r6 = (r6 >> 1) & 0xFF00FF; 1155 1156 r0 |= (r6 << 8); /* pack it back */ 1157 *((uint32*)(p_cur += outpitch)) = r0; 1158 } 1159 p_cur += curr_offset; /* offset to the next pixel */ 1160 if (r13 & 0xFF000700) /* this column need clipping */ 1161 { 1162 p_cur -= 4; 1163 for (i = 0; i < 4; i++) 1164 { 1165 p_ref = in + i; 1166 p_cur -= outpitch; /* compensate for the first offset */ 1167 1168 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1169 while ((uint32)p_ref < tmp) 1170 { /* loop un-rolled */ 1171 r0 = *(p_ref - (inpitch << 1)); 1172 r1 = *(p_ref - inpitch); 1173 r2 = *p_ref; 1174 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1175 r4 = *(p_ref += inpitch); 1176 /* first pixel */ 1177 r5 = *(p_ref += inpitch); 1178 result = (r0 + r5); 1179 r0 = (r1 + r4); 1180 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1181 r0 = (r2 + r3); 1182 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1183 result = (result + 16) >> 5; 1184 CLIP_RESULT(result) 1185 /* 3/4 pel, no need to clip */ 1186 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1187 result = (result >> 1); 1188 *(p_cur += outpitch) = result; 1189 /* second pixel */ 1190 r0 = *(p_ref += inpitch); 1191 result = (r1 + r0); 1192 r1 = (r2 + r5); 1193 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1194 r1 = (r3 + r4); 1195 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1196 result = (result + 16) >> 5; 1197 CLIP_RESULT(result) 1198 /* 3/4 pel, no need to clip */ 1199 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1200 result = (result >> 1); 1201 *(p_cur += outpitch) = result; 1202 /* third pixel */ 1203 r1 = *(p_ref += inpitch); 1204 result = (r2 + r1); 1205 r2 = (r3 + r0); 1206 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1207 r2 = (r4 + r5); 1208 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1209 result = (result + 16) >> 5; 1210 CLIP_RESULT(result) 1211 /* 3/4 pel, no need to clip */ 1212 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1213 result = (result >> 1); 1214 *(p_cur += outpitch) = result; 1215 /* fourth pixel */ 1216 r2 = *(p_ref += inpitch); 1217 result = (r3 + r2); 1218 r3 = (r4 + r1); 1219 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1220 r3 = (r5 + r0); 1221 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1222 result = (result + 16) >> 5; 1223 CLIP_RESULT(result) 1224 /* 3/4 pel, no need to clip */ 1225 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1226 result = (result >> 1); 1227 *(p_cur += outpitch) = result; 1228 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1229 } 1230 p_cur += (curr_offset - 3); 1231 } 1232 } 1233 } 1234 } 1235 else 1236 { 1237 for (j = 0; j < blkwidth; j += 4, in += 4) 1238 { 1239 r13 = 0; 1240 p_ref = in; 1241 p_cur -= outpitch; /* compensate for the first offset */ 1242 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1243 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1244 { 1245 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1246 p_ref += inpitch; 1247 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1248 r0 &= 0xFF00FF; 1249 1250 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1251 r7 = (r1 >> 8) & 0xFF00FF; 1252 r1 &= 0xFF00FF; 1253 1254 r0 += r1; 1255 r6 += r7; 1256 1257 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1258 r8 = (r2 >> 8) & 0xFF00FF; 1259 r2 &= 0xFF00FF; 1260 1261 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1262 r7 = (r1 >> 8) & 0xFF00FF; 1263 r1 &= 0xFF00FF; 1264 r1 += r2; 1265 1266 r7 += r8; 1267 1268 r0 += 20 * r1; 1269 r6 += 20 * r7; 1270 r0 += 0x100010; 1271 r6 += 0x100010; 1272 1273 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1274 r8 = (r2 >> 8) & 0xFF00FF; 1275 r2 &= 0xFF00FF; 1276 1277 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1278 r7 = (r1 >> 8) & 0xFF00FF; 1279 r1 &= 0xFF00FF; 1280 r1 += r2; 1281 1282 r7 += r8; 1283 1284 r0 -= 5 * r1; 1285 r6 -= 5 * r7; 1286 1287 r0 >>= 5; 1288 r6 >>= 5; 1289 /* clip */ 1290 r13 |= r6; 1291 r13 |= r0; 1292 //CLIPPACK(r6,result) 1293 r0 &= 0xFF00FF; 1294 r6 &= 0xFF00FF; 1295 r0 |= (r6 << 8); /* pack it back */ 1296 *((uint32*)(p_cur += outpitch)) = r0; 1297 } 1298 p_cur += curr_offset; /* offset to the next pixel */ 1299 if (r13 & 0xFF000700) /* this column need clipping */ 1300 { 1301 p_cur -= 4; 1302 for (i = 0; i < 4; i++) 1303 { 1304 p_ref = in + i; 1305 p_cur -= outpitch; /* compensate for the first offset */ 1306 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1307 while ((uint32)p_ref < tmp) 1308 { /* loop un-rolled */ 1309 r0 = *(p_ref - (inpitch << 1)); 1310 r1 = *(p_ref - inpitch); 1311 r2 = *p_ref; 1312 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1313 r4 = *(p_ref += inpitch); 1314 /* first pixel */ 1315 r5 = *(p_ref += inpitch); 1316 result = (r0 + r5); 1317 r0 = (r1 + r4); 1318 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1319 r0 = (r2 + r3); 1320 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1321 result = (result + 16) >> 5; 1322 CLIP_RESULT(result) 1323 *(p_cur += outpitch) = result; 1324 /* second pixel */ 1325 r0 = *(p_ref += inpitch); 1326 result = (r1 + r0); 1327 r1 = (r2 + r5); 1328 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1329 r1 = (r3 + r4); 1330 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1331 result = (result + 16) >> 5; 1332 CLIP_RESULT(result) 1333 *(p_cur += outpitch) = result; 1334 /* third pixel */ 1335 r1 = *(p_ref += inpitch); 1336 result = (r2 + r1); 1337 r2 = (r3 + r0); 1338 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1339 r2 = (r4 + r5); 1340 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1341 result = (result + 16) >> 5; 1342 CLIP_RESULT(result) 1343 *(p_cur += outpitch) = result; 1344 /* fourth pixel */ 1345 r2 = *(p_ref += inpitch); 1346 result = (r3 + r2); 1347 r3 = (r4 + r1); 1348 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1349 r3 = (r5 + r0); 1350 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1351 result = (result + 16) >> 5; 1352 CLIP_RESULT(result) 1353 *(p_cur += outpitch) = result; 1354 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1355 } 1356 p_cur += (curr_offset - 3); 1357 } 1358 } 1359 } 1360 } 1361 1362 return ; 1363} 1364 1365void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch, 1366 …
Large files files are truncated, but you can click here to view the full file