/media/libstagefright/codecs/m4v_h263/enc/src/fastidct.cpp
https://bitbucket.org/aways/android_frameworks_av · C++ · 1888 lines · 1527 code · 229 blank · 132 comment · 32 complexity · ab869ee4389e36833b879389343ec6d3 MD5 · raw file
- /* ------------------------------------------------------------------
- * Copyright (C) 1998-2009 PacketVideo
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
- * express or implied.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- * -------------------------------------------------------------------
- */
- /*
- ------------------------------------------------------------------------------
- REVISION HISTORY
- Who: Date: July/2001
- Description: 1. Optimized BlockIDCT bitmap checking.
- 2. Rearranged functions.
- 3. Do column IDCT first, then row IDCT.
- 4. Combine motion comp and IDCT, require
- two sets of row IDCTs one for INTRA
- and one for INTER.
- 5. Add AAN IDCT
- Who: Date: 8/16/01
- 1. Increase the input precision to 8 bits, i.e. change RDCTBITS
- to 11, have to comment out all in-line assembly since 16 bit
- multiplication doesn't work. Try to use diffent precision with
- 32 bit mult. but hasn't finished. Turns out that without in-line
- assembly the performance doesn't change much (only 1%).
- Who: Date: 9/04/05
- 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
- */
- #include "mp4def.h"
- #include "mp4enc_lib.h"
- #include "mp4lib_int.h"
- #include "dct.h"
- #define ADD_CLIP { \
- tmp = *rec + tmp; \
- if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
- *rec++ = tmp; \
- }
- #define INTRA_CLIP { \
- if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
- *rec++ = tmp; \
- }
- #define CLIP_RESULT(x) if((UInt)x > 0xFF){x = 0xFF & (~(x>>31));}
- #define ADD_AND_CLIP1(x) x += (pred_word&0xFF); CLIP_RESULT(x);
- #define ADD_AND_CLIP2(x) x += ((pred_word>>8)&0xFF); CLIP_RESULT(x);
- #define ADD_AND_CLIP3(x) x += ((pred_word>>16)&0xFF); CLIP_RESULT(x);
- #define ADD_AND_CLIP4(x) x += ((pred_word>>24)&0xFF); CLIP_RESULT(x);
- void idct_col0(Short *blk)
- {
- OSCL_UNUSED_ARG(blk);
- return;
- }
- void idct_col1(Short *blk)
- {
- blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
- blk[0] << 3;
- return ;
- }
- void idct_col2(Short *blk)
- {
- int32 x0, x1, x3, x5, x7;//, x8;
- x1 = blk[8];
- x0 = ((int32)blk[0] << 11) + 128;
- /* both upper and lower*/
- x7 = W7 * x1;
- x1 = W1 * x1;
- x3 = x7;
- x5 = (181 * (x1 - x7) + 128) >> 8;
- x7 = (181 * (x1 + x7) + 128) >> 8;
- blk[0] = (x0 + x1) >> 8;
- blk[8] = (x0 + x7) >> 8;
- blk[16] = (x0 + x5) >> 8;
- blk[24] = (x0 + x3) >> 8;
- blk[56] = (x0 - x1) >> 8;
- blk[48] = (x0 - x7) >> 8;
- blk[40] = (x0 - x5) >> 8;
- blk[32] = (x0 - x3) >> 8;
- return ;
- }
- void idct_col3(Short *blk)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- x2 = blk[16];
- x1 = blk[8];
- x0 = ((int32)blk[0] << 11) + 128;
- x4 = x0;
- x6 = W6 * x2;
- x2 = W2 * x2;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = W7 * x1;
- x1 = W1 * x1;
- x3 = x7;
- x5 = (181 * (x1 - x7) + 128) >> 8;
- x7 = (181 * (x1 + x7) + 128) >> 8;
- blk[0] = (x0 + x1) >> 8;
- blk[8] = (x4 + x7) >> 8;
- blk[16] = (x6 + x5) >> 8;
- blk[24] = (x2 + x3) >> 8;
- blk[56] = (x0 - x1) >> 8;
- blk[48] = (x4 - x7) >> 8;
- blk[40] = (x6 - x5) >> 8;
- blk[32] = (x2 - x3) >> 8;
- return ;
- }
- void idct_col4(Short *blk)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- x2 = blk[16];
- x1 = blk[8];
- x3 = blk[24];
- x0 = ((int32)blk[0] << 11) + 128;
- x4 = x0;
- x6 = W6 * x2;
- x2 = W2 * x2;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = W7 * x1;
- x1 = W1 * x1;
- x5 = W3 * x3;
- x3 = -W5 * x3;
- x8 = x1 - x5;
- x1 += x5;
- x5 = x8;
- x8 = x7 - x3;
- x3 += x7;
- x7 = (181 * (x5 + x8) + 128) >> 8;
- x5 = (181 * (x5 - x8) + 128) >> 8;
- blk[0] = (x0 + x1) >> 8;
- blk[8] = (x4 + x7) >> 8;
- blk[16] = (x6 + x5) >> 8;
- blk[24] = (x2 + x3) >> 8;
- blk[56] = (x0 - x1) >> 8;
- blk[48] = (x4 - x7) >> 8;
- blk[40] = (x6 - x5) >> 8;
- blk[32] = (x2 - x3) >> 8;
- return ;
- }
- #ifndef SMALL_DCT
- void idct_col0x40(Short *blk)
- {
- int32 x1, x3, x5, x7;//, x8;
- x1 = blk[8];
- /* both upper and lower*/
- x7 = W7 * x1;
- x1 = W1 * x1;
- x3 = x7;
- x5 = (181 * (x1 - x7) + 128) >> 8;
- x7 = (181 * (x1 + x7) + 128) >> 8;
- blk[0] = (128 + x1) >> 8;
- blk[8] = (128 + x7) >> 8;
- blk[16] = (128 + x5) >> 8;
- blk[24] = (128 + x3) >> 8;
- blk[56] = (128 - x1) >> 8;
- blk[48] = (128 - x7) >> 8;
- blk[40] = (128 - x5) >> 8;
- blk[32] = (128 - x3) >> 8;
- return ;
- }
- void idct_col0x20(Short *blk)
- {
- int32 x0, x2, x4, x6;
- x2 = blk[16];
- x6 = W6 * x2;
- x2 = W2 * x2;
- x0 = 128 + x2;
- x2 = 128 - x2;
- x4 = 128 + x6;
- x6 = 128 - x6;
- blk[0] = (x0) >> 8;
- blk[56] = (x0) >> 8;
- blk[8] = (x4) >> 8;
- blk[48] = (x4) >> 8;
- blk[16] = (x6) >> 8;
- blk[40] = (x6) >> 8;
- blk[24] = (x2) >> 8;
- blk[32] = (x2) >> 8;
- return ;
- }
- void idct_col0x10(Short *blk)
- {
- int32 x1, x3, x5, x7;
- x3 = blk[24];
- x1 = W3 * x3;
- x3 = W5 * x3;
- x7 = (181 * (x3 - x1) + 128) >> 8;
- x5 = (-181 * (x1 + x3) + 128) >> 8;
- blk[0] = (128 + x1) >> 8;
- blk[8] = (128 + x7) >> 8;
- blk[16] = (128 + x5) >> 8;
- blk[24] = (128 - x3) >> 8;
- blk[56] = (128 - x1) >> 8;
- blk[48] = (128 - x7) >> 8;
- blk[40] = (128 - x5) >> 8;
- blk[32] = (128 + x3) >> 8;
- return ;
- }
- #endif /* SMALL_DCT */
- void idct_col(Short *blk)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- x1 = (int32)blk[32] << 11;
- x2 = blk[48];
- x3 = blk[16];
- x4 = blk[8];
- x5 = blk[56];
- x6 = blk[40];
- x7 = blk[24];
- x0 = ((int32)blk[0] << 11) + 128;
- /* first stage */
- x8 = W7 * (x4 + x5);
- x4 = x8 + (W1 - W7) * x4;
- x5 = x8 - (W1 + W7) * x5;
- x8 = W3 * (x6 + x7);
- x6 = x8 - (W3 - W5) * x6;
- x7 = x8 - (W3 + W5) * x7;
- /* second stage */
- x8 = x0 + x1;
- x0 -= x1;
- x1 = W6 * (x3 + x2);
- x2 = x1 - (W2 + W6) * x2;
- x3 = x1 + (W2 - W6) * x3;
- x1 = x4 + x6;
- x4 -= x6;
- x6 = x5 + x7;
- x5 -= x7;
- /* third stage */
- x7 = x8 + x3;
- x8 -= x3;
- x3 = x0 + x2;
- x0 -= x2;
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x4 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- blk[0] = (x7 + x1) >> 8;
- blk[8] = (x3 + x2) >> 8;
- blk[16] = (x0 + x4) >> 8;
- blk[24] = (x8 + x6) >> 8;
- blk[32] = (x8 - x6) >> 8;
- blk[40] = (x0 - x4) >> 8;
- blk[48] = (x3 - x2) >> 8;
- blk[56] = (x7 - x1) >> 8;
- return ;
- }
- /* This function should not be called at all ****/
- void idct_row0Inter(Short *srce, UChar *rec, Int lx)
- {
- OSCL_UNUSED_ARG(srce);
- OSCL_UNUSED_ARG(rec);
- OSCL_UNUSED_ARG(lx);
- return;
- }
- void idct_row1Inter(Short *blk, UChar *rec, Int lx)
- {
- int tmp;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- tmp = (*(blk += 8) + 32) >> 6;
- *blk = 0;
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = tmp + (pred_word & 0xFF);
- CLIP_RESULT(res);
- res2 = tmp + ((pred_word >> 8) & 0xFF);
- CLIP_RESULT(res2);
- dst_word = (res2 << 8) | res;
- res = tmp + ((pred_word >> 16) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 16);
- res = tmp + ((pred_word >> 24) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = tmp + (pred_word & 0xFF);
- CLIP_RESULT(res);
- res2 = tmp + ((pred_word >> 8) & 0xFF);
- CLIP_RESULT(res2);
- dst_word = (res2 << 8) | res;
- res = tmp + ((pred_word >> 16) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 16);
- res = tmp + ((pred_word >> 24) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return;
- }
- void idct_row2Inter(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x4, x5;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- /* shortcut */
- x4 = blk[9];
- blk[9] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* first stage */
- x5 = (W7 * x4 + 4) >> 3;
- x4 = (W1 * x4 + 4) >> 3;
- /* third stage */
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x1 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (x0 + x4) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x0 + x2) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x0 + x1) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 + x5) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (x0 - x5) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x0 - x1) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x0 - x2) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 - x4) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row3Inter(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- x2 = blk[10];
- blk[10] = 0;
- x1 = blk[9];
- blk[9] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* both upper and lower*/
- /* both x2orx6 and x0orx4 */
- x4 = x0;
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = (W7 * x1 + 4) >> 3;
- x1 = (W1 * x1 + 4) >> 3;
- x3 = x7;
- x5 = (181 * (x1 - x7) + 128) >> 8;
- x7 = (181 * (x1 + x7) + 128) >> 8;
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (x0 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x4 + x7) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x6 + x5) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x2 + x3) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (x2 - x3) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x6 - x5) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x4 - x7) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row4Inter(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- x2 = blk[10];
- blk[10] = 0;
- x1 = blk[9];
- blk[9] = 0;
- x3 = blk[11];
- blk[11] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- x4 = x0;
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = (W7 * x1 + 4) >> 3;
- x1 = (W1 * x1 + 4) >> 3;
- x5 = (W3 * x3 + 4) >> 3;
- x3 = (- W5 * x3 + 4) >> 3;
- x8 = x1 - x5;
- x1 += x5;
- x5 = x8;
- x8 = x7 - x3;
- x3 += x7;
- x7 = (181 * (x5 + x8) + 128) >> 8;
- x5 = (181 * (x5 - x8) + 128) >> 8;
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (x0 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x4 + x7) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x6 + x5) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x2 + x3) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (x2 - x3) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x6 - x5) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x4 - x7) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- #ifndef SMALL_DCT
- void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
- {
- int32 x1, x2, x4, x5;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- while (i--)
- {
- /* shortcut */
- x4 = blk[1];
- blk[1] = 0;
- blk += 8; /* for proper rounding in the fourth stage */
- /* first stage */
- x5 = (W7 * x4 + 4) >> 3;
- x4 = (W1 * x4 + 4) >> 3;
- /* third stage */
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x1 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (8192 + x4) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 + x2) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 + x1) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 + x5) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (8192 - x5) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 - x1) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 - x2) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 - x4) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x2, x4, x6;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- while (i--)
- {
- x2 = blk[2];
- blk[2] = 0;
- blk += 8; /* for proper rounding in the fourth stage */
- /* both upper and lower*/
- /* both x2orx6 and x0orx4 */
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x0 = 8192 + x2;
- x2 = 8192 - x2;
- x4 = 8192 + x6;
- x6 = 8192 - x6;
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (x0) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x4) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x6) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x2) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (x2) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x6) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x4) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
- {
- int32 x1, x3, x5, x7;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- while (i--)
- {
- x3 = blk[3];
- blk[3] = 0;
- blk += 8;
- x1 = (W3 * x3 + 4) >> 3;
- x3 = (-W5 * x3 + 4) >> 3;
- x7 = (-181 * (x3 + x1) + 128) >> 8;
- x5 = (181 * (x3 - x1) + 128) >> 8;
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (8192 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 + x7) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 + x5) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 + x3) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (8192 - x3) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 - x5) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 - x7) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- #endif /* SMALL_DCT */
- void idct_rowInter(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- x1 = (int32)blk[12] << 8;
- blk[12] = 0;
- x2 = blk[14];
- blk[14] = 0;
- x3 = blk[10];
- blk[10] = 0;
- x4 = blk[9];
- blk[9] = 0;
- x5 = blk[15];
- blk[15] = 0;
- x6 = blk[13];
- blk[13] = 0;
- x7 = blk[11];
- blk[11] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* first stage */
- x8 = W7 * (x4 + x5) + 4;
- x4 = (x8 + (W1 - W7) * x4) >> 3;
- x5 = (x8 - (W1 + W7) * x5) >> 3;
- x8 = W3 * (x6 + x7) + 4;
- x6 = (x8 - (W3 - W5) * x6) >> 3;
- x7 = (x8 - (W3 + W5) * x7) >> 3;
- /* second stage */
- x8 = x0 + x1;
- x0 -= x1;
- x1 = W6 * (x3 + x2) + 4;
- x2 = (x1 - (W2 + W6) * x2) >> 3;
- x3 = (x1 + (W2 - W6) * x3) >> 3;
- x1 = x4 + x6;
- x4 -= x6;
- x6 = x5 + x7;
- x5 -= x7;
- /* third stage */
- x7 = x8 + x3;
- x8 -= x3;
- x3 = x0 + x2;
- x0 -= x2;
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x4 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
- res = (x7 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x3 + x2) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x0 + x4) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x8 + x6) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
- res = (x8 - x6) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x0 - x4) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x3 - x2) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x7 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return;
- }
- void idct_row0Intra(Short *srce, UChar *rec, Int lx)
- {
- OSCL_UNUSED_ARG(srce);
- OSCL_UNUSED_ARG(rec);
- OSCL_UNUSED_ARG(lx);
- return;
- }
- void idct_row1Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 tmp;
- int i = 8;
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- tmp = ((*(blk += 8) + 32) >> 6);
- *blk = 0;
- CLIP_RESULT(tmp)
- tmp |= (tmp << 8);
- tmp |= (tmp << 16);
- *((uint32*)(rec += lx)) = tmp;
- *((uint32*)(rec + 4)) = tmp;
- }
- return;
- }
- void idct_row2Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x4, x5;
- int res, res2;
- uint32 dst_word;
- int i = 8;
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- /* shortcut */
- x4 = blk[9];
- blk[9] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* first stage */
- x5 = (W7 * x4 + 4) >> 3;
- x4 = (W1 * x4 + 4) >> 3;
- /* third stage */
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x1 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- res = ((x0 + x4) >> 14);
- CLIP_RESULT(res)
- res2 = ((x0 + x2) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x0 + x1) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x0 + x5) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((x0 - x5) >> 14);
- CLIP_RESULT(res)
- res2 = ((x0 - x1) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x0 - x2) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x0 - x4) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return ;
- }
- void idct_row3Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int res, res2;
- uint32 dst_word;
- int i = 8;
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- x2 = blk[10];
- blk[10] = 0;
- x1 = blk[9];
- blk[9] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0;/* for proper rounding in the fourth stage */
- /* both upper and lower*/
- /* both x2orx6 and x0orx4 */
- x4 = x0;
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = (W7 * x1 + 4) >> 3;
- x1 = (W1 * x1 + 4) >> 3;
- x3 = x7;
- x5 = (181 * (x1 - x7) + 128) >> 8;
- x7 = (181 * (x1 + x7) + 128) >> 8;
- res = ((x0 + x1) >> 14);
- CLIP_RESULT(res)
- res2 = ((x4 + x7) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x6 + x5) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x2 + x3) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((x2 - x3) >> 14);
- CLIP_RESULT(res)
- res2 = ((x6 - x5) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x4 - x7) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x0 - x1) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return ;
- }
- void idct_row4Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int res, res2;
- uint32 dst_word;
- int i = 8;
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- x2 = blk[10];
- blk[10] = 0;
- x1 = blk[9];
- blk[9] = 0;
- x3 = blk[11];
- blk[11] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- x4 = x0;
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = (W7 * x1 + 4) >> 3;
- x1 = (W1 * x1 + 4) >> 3;
- x5 = (W3 * x3 + 4) >> 3;
- x3 = (- W5 * x3 + 4) >> 3;
- x8 = x1 - x5;
- x1 += x5;
- x5 = x8;
- x8 = x7 - x3;
- x3 += x7;
- x7 = (181 * (x5 + x8) + 128) >> 8;
- x5 = (181 * (x5 - x8) + 128) >> 8;
- res = ((x0 + x1) >> 14);
- CLIP_RESULT(res)
- res2 = ((x4 + x7) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x6 + x5) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x2 + x3) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((x2 - x3) >> 14);
- CLIP_RESULT(res)
- res2 = ((x6 - x5) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x4 - x7) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x0 - x1) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return ;
- }
- #ifndef SMALL_DCT
- void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 x1, x2, x4, x5;
- int res, res2;
- uint32 dst_word;
- int i = 8;
- rec -= lx;
- while (i--)
- {
- /* shortcut */
- x4 = blk[1];
- blk[1] = 0;
- blk += 8;
- /* first stage */
- x5 = (W7 * x4 + 4) >> 3;
- x4 = (W1 * x4 + 4) >> 3;
- /* third stage */
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x1 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- res = ((8192 + x4) >> 14);
- CLIP_RESULT(res)
- res2 = ((8192 + x2) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((8192 + x1) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((8192 + x5) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((8192 - x5) >> 14);
- CLIP_RESULT(res)
- res2 = ((8192 - x1) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((8192 - x2) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((8192 - x4) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return ;
- }
- void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x2, x4, x6;
- int res, res2;
- uint32 dst_word;
- int i = 8;
- rec -= lx;
- while (i--)
- {
- x2 = blk[2];
- blk[2] = 0;
- blk += 8;
- /* both upper and lower*/
- /* both x2orx6 and x0orx4 */
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x0 = 8192 + x2;
- x2 = 8192 - x2;
- x4 = 8192 + x6;
- x6 = 8192 - x6;
- res = ((x0) >> 14);
- CLIP_RESULT(res)
- res2 = ((x4) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x6) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x2) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((x2) >> 14);
- CLIP_RESULT(res)
- res2 = ((x6) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((x4) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x0) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return ;
- }
- void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
- {
- int32 x1, x3, x5, x7;
- int res, res2;
- uint32 dst_word;
- int i = 8;
- rec -= lx;
- while (i--)
- {
- x3 = blk[3];
- blk[3] = 0 ;
- blk += 8;
- x1 = (W3 * x3 + 4) >> 3;
- x3 = (W5 * x3 + 4) >> 3;
- x7 = (181 * (x3 - x1) + 128) >> 8;
- x5 = (-181 * (x1 + x3) + 128) >> 8;
- res = ((8192 + x1) >> 14);
- CLIP_RESULT(res)
- res2 = ((8192 + x7) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((8192 + x5) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((8192 - x3) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((8192 + x3) >> 14);
- CLIP_RESULT(res)
- res2 = ((8192 - x5) >> 14);
- CLIP_RESULT(res2)
- dst_word = (res2 << 8) | res;
- res = ((8192 - x7) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((8192 - x1) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return ;
- }
- #endif /* SMALL_DCT */
- void idct_rowIntra(Short *blk, UChar *rec, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- int res, res2;
- uint32 dst_word;
- blk -= 8;
- rec -= lx;
- while (i--)
- {
- x1 = (int32)blk[12] << 8;
- blk[12] = 0;
- x2 = blk[14];
- blk[14] = 0;
- x3 = blk[10];
- blk[10] = 0;
- x4 = blk[9];
- blk[9] = 0;
- x5 = blk[15];
- blk[15] = 0;
- x6 = blk[13];
- blk[13] = 0;
- x7 = blk[11];
- blk[11] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* first stage */
- x8 = W7 * (x4 + x5) + 4;
- x4 = (x8 + (W1 - W7) * x4) >> 3;
- x5 = (x8 - (W1 + W7) * x5) >> 3;
- x8 = W3 * (x6 + x7) + 4;
- x6 = (x8 - (W3 - W5) * x6) >> 3;
- x7 = (x8 - (W3 + W5) * x7) >> 3;
- /* second stage */
- x8 = x0 + x1;
- x0 -= x1;
- x1 = W6 * (x3 + x2) + 4;
- x2 = (x1 - (W2 + W6) * x2) >> 3;
- x3 = (x1 + (W2 - W6) * x3) >> 3;
- x1 = x4 + x6;
- x4 -= x6;
- x6 = x5 + x7;
- x5 -= x7;
- /* third stage */
- x7 = x8 + x3;
- x8 -= x3;
- x3 = x0 + x2;
- x0 -= x2;
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x4 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- res = ((x7 + x1) >> 14);
- CLIP_RESULT(res)
- res2 = ((x3 + x2) >> 14);
- CLIP_RESULT(res2)
- dst_word = res | (res2 << 8);
- res = ((x0 + x4) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x8 + x6) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word;
- res = ((x8 - x6) >> 14);
- CLIP_RESULT(res)
- res2 = ((x0 - x4) >> 14);
- CLIP_RESULT(res2)
- dst_word = res | (res2 << 8);
- res = ((x3 - x2) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 16);
- res = ((x7 - x1) >> 14);
- CLIP_RESULT(res)
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word;
- }
- return;
- }
- /* This function should not be called at all ****/
- void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
- {
- OSCL_UNUSED_ARG(srce);
- OSCL_UNUSED_ARG(rec);
- OSCL_UNUSED_ARG(pred);
- OSCL_UNUSED_ARG(lx);
- return;
- }
- void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int tmp;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- pred -= 16;
- rec -= lx;
- blk -= 8;
- while (i--)
- {
- tmp = (*(blk += 8) + 32) >> 6;
- *blk = 0;
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = tmp + (pred_word & 0xFF);
- CLIP_RESULT(res);
- res2 = tmp + ((pred_word >> 8) & 0xFF);
- CLIP_RESULT(res2);
- dst_word = (res2 << 8) | res;
- res = tmp + ((pred_word >> 16) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 16);
- res = tmp + ((pred_word >> 24) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = tmp + (pred_word & 0xFF);
- CLIP_RESULT(res);
- res2 = tmp + ((pred_word >> 8) & 0xFF);
- CLIP_RESULT(res2);
- dst_word = (res2 << 8) | res;
- res = tmp + ((pred_word >> 16) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 16);
- res = tmp + ((pred_word >> 24) & 0xFF);
- CLIP_RESULT(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return;
- }
- void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x0, x1, x2, x4, x5;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- blk -= 8;
- while (i--)
- {
- /* shortcut */
- x4 = blk[9];
- blk[9] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* first stage */
- x5 = (W7 * x4 + 4) >> 3;
- x4 = (W1 * x4 + 4) >> 3;
- /* third stage */
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x1 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (x0 + x4) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x0 + x2) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x0 + x1) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 + x5) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (x0 - x5) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x0 - x1) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x0 - x2) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 - x4) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- blk -= 8;
- while (i--)
- {
- x2 = blk[10];
- blk[10] = 0;
- x1 = blk[9];
- blk[9] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* both upper and lower*/
- /* both x2orx6 and x0orx4 */
- x4 = x0;
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = (W7 * x1 + 4) >> 3;
- x1 = (W1 * x1 + 4) >> 3;
- x3 = x7;
- x5 = (181 * (x1 - x7) + 128) >> 8;
- x7 = (181 * (x1 + x7) + 128) >> 8;
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (x0 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x4 + x7) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x6 + x5) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x2 + x3) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (x2 - x3) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x6 - x5) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x4 - x7) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- blk -= 8;
- while (i--)
- {
- x2 = blk[10];
- blk[10] = 0;
- x1 = blk[9];
- blk[9] = 0;
- x3 = blk[11];
- blk[11] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- x4 = x0;
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x8 = x0 - x2;
- x0 += x2;
- x2 = x8;
- x8 = x4 - x6;
- x4 += x6;
- x6 = x8;
- x7 = (W7 * x1 + 4) >> 3;
- x1 = (W1 * x1 + 4) >> 3;
- x5 = (W3 * x3 + 4) >> 3;
- x3 = (- W5 * x3 + 4) >> 3;
- x8 = x1 - x5;
- x1 += x5;
- x5 = x8;
- x8 = x7 - x3;
- x3 += x7;
- x7 = (181 * (x5 + x8) + 128) >> 8;
- x5 = (181 * (x5 - x8) + 128) >> 8;
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (x0 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x4 + x7) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x6 + x5) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x2 + x3) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (x2 - x3) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x6 - x5) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x4 - x7) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- #ifndef SMALL_DCT
- void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x1, x2, x4, x5;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- while (i--)
- {
- /* shortcut */
- x4 = blk[1];
- blk[1] = 0;
- blk += 8; /* for proper rounding in the fourth stage */
- /* first stage */
- x5 = (W7 * x4 + 4) >> 3;
- x4 = (W1 * x4 + 4) >> 3;
- /* third stage */
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x1 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (8192 + x4) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 + x2) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 + x1) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 + x5) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (8192 - x5) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 - x1) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 - x2) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 - x4) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x0, x2, x4, x6;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- while (i--)
- {
- x2 = blk[2];
- blk[2] = 0;
- blk += 8; /* for proper rounding in the fourth stage */
- /* both upper and lower*/
- /* both x2orx6 and x0orx4 */
- x6 = (W6 * x2 + 4) >> 3;
- x2 = (W2 * x2 + 4) >> 3;
- x0 = 8192 + x2;
- x2 = 8192 - x2;
- x4 = 8192 + x6;
- x6 = 8192 - x6;
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (x0) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x4) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x6) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x2) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (x2) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x6) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x4) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x0) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x1, x3, x5, x7;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- while (i--)
- {
- x3 = blk[3];
- blk[3] = 0;
- blk += 8;
- x1 = (W3 * x3 + 4) >> 3;
- x3 = (-W5 * x3 + 4) >> 3;
- x7 = (-181 * (x3 + x1) + 128) >> 8;
- x5 = (181 * (x3 - x1) + 128) >> 8;
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (8192 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 + x7) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 + x5) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 + x3) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (8192 - x3) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (8192 - x5) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (8192 - x7) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (8192 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return ;
- }
- #endif /* SMALL_DCT */
- void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
- {
- int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
- int i = 8;
- uint32 pred_word, dst_word;
- int res, res2;
- /* preset the offset, such that we can take advantage pre-offset addressing mode */
- rec -= lx;
- pred -= 16;
- blk -= 8;
- while (i--)
- {
- x1 = (int32)blk[12] << 8;
- blk[12] = 0;
- x2 = blk[14];
- blk[14] = 0;
- x3 = blk[10];
- blk[10] = 0;
- x4 = blk[9];
- blk[9] = 0;
- x5 = blk[15];
- blk[15] = 0;
- x6 = blk[13];
- blk[13] = 0;
- x7 = blk[11];
- blk[11] = 0;
- x0 = ((*(blk += 8)) << 8) + 8192;
- *blk = 0; /* for proper rounding in the fourth stage */
- /* first stage */
- x8 = W7 * (x4 + x5) + 4;
- x4 = (x8 + (W1 - W7) * x4) >> 3;
- x5 = (x8 - (W1 + W7) * x5) >> 3;
- x8 = W3 * (x6 + x7) + 4;
- x6 = (x8 - (W3 - W5) * x6) >> 3;
- x7 = (x8 - (W3 + W5) * x7) >> 3;
- /* second stage */
- x8 = x0 + x1;
- x0 -= x1;
- x1 = W6 * (x3 + x2) + 4;
- x2 = (x1 - (W2 + W6) * x2) >> 3;
- x3 = (x1 + (W2 - W6) * x3) >> 3;
- x1 = x4 + x6;
- x4 -= x6;
- x6 = x5 + x7;
- x5 -= x7;
- /* third stage */
- x7 = x8 + x3;
- x8 -= x3;
- x3 = x0 + x2;
- x0 -= x2;
- x2 = (181 * (x4 + x5) + 128) >> 8;
- x4 = (181 * (x4 - x5) + 128) >> 8;
- /* fourth stage */
- pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
- res = (x7 + x1) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x3 + x2) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x0 + x4) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x8 + x6) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
- pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
- res = (x8 - x6) >> 14;
- ADD_AND_CLIP1(res);
- res2 = (x0 - x4) >> 14;
- ADD_AND_CLIP2(res2);
- dst_word = (res2 << 8) | res;
- res = (x3 - x2) >> 14;
- ADD_AND_CLIP3(res);
- dst_word |= (res << 16);
- res = (x7 - x1) >> 14;
- ADD_AND_CLIP4(res);
- dst_word |= (res << 24);
- *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
- }
- return;
- }
- /*----------------------------------------------------------------------------
- ; End Function: idctcol
- ----------------------------------------------------------------------------*/
- /* ======================================================================== */
- /* Function : BlockIDCTMotionComp */
- /* Date : 10/16/2000 */
- /* Purpose : fast IDCT routine */
- /* In/out : */
- /* Int* coeff_in Dequantized coefficient
- Int block_out output IDCT coefficient
- Int maxval clip value */
- /* Modified : 7/31/01, add checking for all-zero and DC-only block. */
- /* do 8 columns at a time */
- /* 8/2/01, do column first then row-IDCT. */
- /* 8/2/01, remove clipping (included in motion comp). */
- /* 8/7/01, combine with motion comp. */
- /* 8/8/01, use AAN IDCT */
- /* 9/4/05, use Chen's IDCT and 16 bit block */
- /* ======================================================================== */
- void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
- Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
- {
- Int i;
- Int tmp, tmp2;
- ULong tmp4;
- Int bmap;
- Short *ptr = block;
- UChar *endcol;
- UInt mask = 0xFF;
- Int lx = lx_intra >> 1;
- Int intra = (lx_intra & 1);
- /* all-zero block */
- if (dctMode == 0 || bitmaprow == 0)
- {
- if (intra)
- {
- *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) = 0;
- *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) = 0;
- *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) = 0;
- *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) = 0;
- *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) = 0;
- *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) = 0;
- *((ULong*)(rec + 4)) = 0;
- *((ULong*)(rec += lx)) =