/media/libstagefright/codecs/m4v_h263/enc/src/fastidct.cpp

https://bitbucket.org/aways/android_frameworks_av · C++ · 1888 lines · 1527 code · 229 blank · 132 comment · 32 complexity · ab869ee4389e36833b879389343ec6d3 MD5 · raw file

  1. /* ------------------------------------------------------------------
  2. * Copyright (C) 1998-2009 PacketVideo
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
  13. * express or implied.
  14. * See the License for the specific language governing permissions
  15. * and limitations under the License.
  16. * -------------------------------------------------------------------
  17. */
  18. /*
  19. ------------------------------------------------------------------------------
  20. REVISION HISTORY
  21. Who: Date: July/2001
  22. Description: 1. Optimized BlockIDCT bitmap checking.
  23. 2. Rearranged functions.
  24. 3. Do column IDCT first, then row IDCT.
  25. 4. Combine motion comp and IDCT, require
  26. two sets of row IDCTs one for INTRA
  27. and one for INTER.
  28. 5. Add AAN IDCT
  29. Who: Date: 8/16/01
  30. 1. Increase the input precision to 8 bits, i.e. change RDCTBITS
  31. to 11, have to comment out all in-line assembly since 16 bit
  32. multiplication doesn't work. Tried to use a different precision with
  33. 32 bit mult., but that has not been finished. Turns out that without in-line
  34. assembly the performance doesn't change much (only 1%).
  35. Who: Date: 9/04/05
  36. 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
  37. */
  38. #include "mp4def.h"
  39. #include "mp4enc_lib.h"
  40. #include "mp4lib_int.h"
  41. #include "dct.h"
  /*
   * Saturating helpers shared by the row IDCT routines.
   *
   * All of them use the same test: casting to UInt makes a negative value look
   * huge, so a single unsigned comparison detects both "below 0" and "above the
   * limit".  The replacement value is then chosen with an explicit sign test
   * instead of the former "~(v >> 31)" mask, which depended on the
   * implementation-defined result of right-shifting a negative value.
   */

  /* Add the pixel at *rec to tmp, saturate to [0, mask], store the result at
   * *rec and advance rec.  Expects tmp, mask and rec in the caller's scope. */
  #define ADD_CLIP { \
      tmp = *rec + tmp; \
      if ((UInt)tmp > mask) tmp = (tmp < 0) ? 0 : mask; \
      *rec++ = tmp; \
  }

  /* Saturate tmp to [0, mask], store it at *rec and advance rec. */
  #define INTRA_CLIP { \
      if ((UInt)tmp > mask) tmp = (tmp < 0) ? 0 : mask; \
      *rec++ = tmp; \
  }

  /* Saturate the lvalue x to [0, 255].  Deliberately left as a bare "if" block
   * because existing call sites use it both with and without a trailing ';'. */
  #define CLIP_RESULT(x) if ((UInt)(x) > 0xFF) { (x) = ((x) < 0) ? 0 : 0xFF; }

  /* Add byte n-1 of pred_word (byte 0 = least significant) to x and saturate to
   * [0, 255].  pred_word must be in scope at the point of use.  The expansion
   * is one braced block so that the add and the clip always stay together. */
  #define ADD_AND_CLIP1(x) { (x) += (pred_word & 0xFF); CLIP_RESULT(x) }
  #define ADD_AND_CLIP2(x) { (x) += ((pred_word >> 8) & 0xFF); CLIP_RESULT(x) }
  #define ADD_AND_CLIP3(x) { (x) += ((pred_word >> 16) & 0xFF); CLIP_RESULT(x) }
  #define ADD_AND_CLIP4(x) { (x) += ((pred_word >> 24) & 0xFF); CLIP_RESULT(x) }
  56. void idct_col0(Short *blk)
  57. {
  58. OSCL_UNUSED_ARG(blk);
  59. return;
  60. }
  61. void idct_col1(Short *blk)
  62. {
  63. blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
  64. blk[0] << 3;
  65. return ;
  66. }
  67. void idct_col2(Short *blk)
  68. {
  69. int32 x0, x1, x3, x5, x7;//, x8;
  70. x1 = blk[8];
  71. x0 = ((int32)blk[0] << 11) + 128;
  72. /* both upper and lower*/
  73. x7 = W7 * x1;
  74. x1 = W1 * x1;
  75. x3 = x7;
  76. x5 = (181 * (x1 - x7) + 128) >> 8;
  77. x7 = (181 * (x1 + x7) + 128) >> 8;
  78. blk[0] = (x0 + x1) >> 8;
  79. blk[8] = (x0 + x7) >> 8;
  80. blk[16] = (x0 + x5) >> 8;
  81. blk[24] = (x0 + x3) >> 8;
  82. blk[56] = (x0 - x1) >> 8;
  83. blk[48] = (x0 - x7) >> 8;
  84. blk[40] = (x0 - x5) >> 8;
  85. blk[32] = (x0 - x3) >> 8;
  86. return ;
  87. }
  88. void idct_col3(Short *blk)
  89. {
  90. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  91. x2 = blk[16];
  92. x1 = blk[8];
  93. x0 = ((int32)blk[0] << 11) + 128;
  94. x4 = x0;
  95. x6 = W6 * x2;
  96. x2 = W2 * x2;
  97. x8 = x0 - x2;
  98. x0 += x2;
  99. x2 = x8;
  100. x8 = x4 - x6;
  101. x4 += x6;
  102. x6 = x8;
  103. x7 = W7 * x1;
  104. x1 = W1 * x1;
  105. x3 = x7;
  106. x5 = (181 * (x1 - x7) + 128) >> 8;
  107. x7 = (181 * (x1 + x7) + 128) >> 8;
  108. blk[0] = (x0 + x1) >> 8;
  109. blk[8] = (x4 + x7) >> 8;
  110. blk[16] = (x6 + x5) >> 8;
  111. blk[24] = (x2 + x3) >> 8;
  112. blk[56] = (x0 - x1) >> 8;
  113. blk[48] = (x4 - x7) >> 8;
  114. blk[40] = (x6 - x5) >> 8;
  115. blk[32] = (x2 - x3) >> 8;
  116. return ;
  117. }
  118. void idct_col4(Short *blk)
  119. {
  120. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  121. x2 = blk[16];
  122. x1 = blk[8];
  123. x3 = blk[24];
  124. x0 = ((int32)blk[0] << 11) + 128;
  125. x4 = x0;
  126. x6 = W6 * x2;
  127. x2 = W2 * x2;
  128. x8 = x0 - x2;
  129. x0 += x2;
  130. x2 = x8;
  131. x8 = x4 - x6;
  132. x4 += x6;
  133. x6 = x8;
  134. x7 = W7 * x1;
  135. x1 = W1 * x1;
  136. x5 = W3 * x3;
  137. x3 = -W5 * x3;
  138. x8 = x1 - x5;
  139. x1 += x5;
  140. x5 = x8;
  141. x8 = x7 - x3;
  142. x3 += x7;
  143. x7 = (181 * (x5 + x8) + 128) >> 8;
  144. x5 = (181 * (x5 - x8) + 128) >> 8;
  145. blk[0] = (x0 + x1) >> 8;
  146. blk[8] = (x4 + x7) >> 8;
  147. blk[16] = (x6 + x5) >> 8;
  148. blk[24] = (x2 + x3) >> 8;
  149. blk[56] = (x0 - x1) >> 8;
  150. blk[48] = (x4 - x7) >> 8;
  151. blk[40] = (x6 - x5) >> 8;
  152. blk[32] = (x2 - x3) >> 8;
  153. return ;
  154. }
  155. #ifndef SMALL_DCT
  156. void idct_col0x40(Short *blk)
  157. {
  158. int32 x1, x3, x5, x7;//, x8;
  159. x1 = blk[8];
  160. /* both upper and lower*/
  161. x7 = W7 * x1;
  162. x1 = W1 * x1;
  163. x3 = x7;
  164. x5 = (181 * (x1 - x7) + 128) >> 8;
  165. x7 = (181 * (x1 + x7) + 128) >> 8;
  166. blk[0] = (128 + x1) >> 8;
  167. blk[8] = (128 + x7) >> 8;
  168. blk[16] = (128 + x5) >> 8;
  169. blk[24] = (128 + x3) >> 8;
  170. blk[56] = (128 - x1) >> 8;
  171. blk[48] = (128 - x7) >> 8;
  172. blk[40] = (128 - x5) >> 8;
  173. blk[32] = (128 - x3) >> 8;
  174. return ;
  175. }
  176. void idct_col0x20(Short *blk)
  177. {
  178. int32 x0, x2, x4, x6;
  179. x2 = blk[16];
  180. x6 = W6 * x2;
  181. x2 = W2 * x2;
  182. x0 = 128 + x2;
  183. x2 = 128 - x2;
  184. x4 = 128 + x6;
  185. x6 = 128 - x6;
  186. blk[0] = (x0) >> 8;
  187. blk[56] = (x0) >> 8;
  188. blk[8] = (x4) >> 8;
  189. blk[48] = (x4) >> 8;
  190. blk[16] = (x6) >> 8;
  191. blk[40] = (x6) >> 8;
  192. blk[24] = (x2) >> 8;
  193. blk[32] = (x2) >> 8;
  194. return ;
  195. }
  196. void idct_col0x10(Short *blk)
  197. {
  198. int32 x1, x3, x5, x7;
  199. x3 = blk[24];
  200. x1 = W3 * x3;
  201. x3 = W5 * x3;
  202. x7 = (181 * (x3 - x1) + 128) >> 8;
  203. x5 = (-181 * (x1 + x3) + 128) >> 8;
  204. blk[0] = (128 + x1) >> 8;
  205. blk[8] = (128 + x7) >> 8;
  206. blk[16] = (128 + x5) >> 8;
  207. blk[24] = (128 - x3) >> 8;
  208. blk[56] = (128 - x1) >> 8;
  209. blk[48] = (128 - x7) >> 8;
  210. blk[40] = (128 - x5) >> 8;
  211. blk[32] = (128 + x3) >> 8;
  212. return ;
  213. }
  214. #endif /* SMALL_DCT */
  215. void idct_col(Short *blk)
  216. {
  217. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  218. x1 = (int32)blk[32] << 11;
  219. x2 = blk[48];
  220. x3 = blk[16];
  221. x4 = blk[8];
  222. x5 = blk[56];
  223. x6 = blk[40];
  224. x7 = blk[24];
  225. x0 = ((int32)blk[0] << 11) + 128;
  226. /* first stage */
  227. x8 = W7 * (x4 + x5);
  228. x4 = x8 + (W1 - W7) * x4;
  229. x5 = x8 - (W1 + W7) * x5;
  230. x8 = W3 * (x6 + x7);
  231. x6 = x8 - (W3 - W5) * x6;
  232. x7 = x8 - (W3 + W5) * x7;
  233. /* second stage */
  234. x8 = x0 + x1;
  235. x0 -= x1;
  236. x1 = W6 * (x3 + x2);
  237. x2 = x1 - (W2 + W6) * x2;
  238. x3 = x1 + (W2 - W6) * x3;
  239. x1 = x4 + x6;
  240. x4 -= x6;
  241. x6 = x5 + x7;
  242. x5 -= x7;
  243. /* third stage */
  244. x7 = x8 + x3;
  245. x8 -= x3;
  246. x3 = x0 + x2;
  247. x0 -= x2;
  248. x2 = (181 * (x4 + x5) + 128) >> 8;
  249. x4 = (181 * (x4 - x5) + 128) >> 8;
  250. /* fourth stage */
  251. blk[0] = (x7 + x1) >> 8;
  252. blk[8] = (x3 + x2) >> 8;
  253. blk[16] = (x0 + x4) >> 8;
  254. blk[24] = (x8 + x6) >> 8;
  255. blk[32] = (x8 - x6) >> 8;
  256. blk[40] = (x0 - x4) >> 8;
  257. blk[48] = (x3 - x2) >> 8;
  258. blk[56] = (x7 - x1) >> 8;
  259. return ;
  260. }
  261. /* This function should not be called at all ****/
  262. void idct_row0Inter(Short *srce, UChar *rec, Int lx)
  263. {
  264. OSCL_UNUSED_ARG(srce);
  265. OSCL_UNUSED_ARG(rec);
  266. OSCL_UNUSED_ARG(lx);
  267. return;
  268. }
  269. void idct_row1Inter(Short *blk, UChar *rec, Int lx)
  270. {
  271. int tmp;
  272. int i = 8;
  273. uint32 pred_word, dst_word;
  274. int res, res2;
  275. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  276. rec -= lx;
  277. blk -= 8;
  278. while (i--)
  279. {
  280. tmp = (*(blk += 8) + 32) >> 6;
  281. *blk = 0;
  282. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  283. res = tmp + (pred_word & 0xFF);
  284. CLIP_RESULT(res);
  285. res2 = tmp + ((pred_word >> 8) & 0xFF);
  286. CLIP_RESULT(res2);
  287. dst_word = (res2 << 8) | res;
  288. res = tmp + ((pred_word >> 16) & 0xFF);
  289. CLIP_RESULT(res);
  290. dst_word |= (res << 16);
  291. res = tmp + ((pred_word >> 24) & 0xFF);
  292. CLIP_RESULT(res);
  293. dst_word |= (res << 24);
  294. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  295. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  296. res = tmp + (pred_word & 0xFF);
  297. CLIP_RESULT(res);
  298. res2 = tmp + ((pred_word >> 8) & 0xFF);
  299. CLIP_RESULT(res2);
  300. dst_word = (res2 << 8) | res;
  301. res = tmp + ((pred_word >> 16) & 0xFF);
  302. CLIP_RESULT(res);
  303. dst_word |= (res << 16);
  304. res = tmp + ((pred_word >> 24) & 0xFF);
  305. CLIP_RESULT(res);
  306. dst_word |= (res << 24);
  307. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  308. }
  309. return;
  310. }
  311. void idct_row2Inter(Short *blk, UChar *rec, Int lx)
  312. {
  313. int32 x0, x1, x2, x4, x5;
  314. int i = 8;
  315. uint32 pred_word, dst_word;
  316. int res, res2;
  317. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  318. rec -= lx;
  319. blk -= 8;
  320. while (i--)
  321. {
  322. /* shortcut */
  323. x4 = blk[9];
  324. blk[9] = 0;
  325. x0 = ((*(blk += 8)) << 8) + 8192;
  326. *blk = 0; /* for proper rounding in the fourth stage */
  327. /* first stage */
  328. x5 = (W7 * x4 + 4) >> 3;
  329. x4 = (W1 * x4 + 4) >> 3;
  330. /* third stage */
  331. x2 = (181 * (x4 + x5) + 128) >> 8;
  332. x1 = (181 * (x4 - x5) + 128) >> 8;
  333. /* fourth stage */
  334. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  335. res = (x0 + x4) >> 14;
  336. ADD_AND_CLIP1(res);
  337. res2 = (x0 + x2) >> 14;
  338. ADD_AND_CLIP2(res2);
  339. dst_word = (res2 << 8) | res;
  340. res = (x0 + x1) >> 14;
  341. ADD_AND_CLIP3(res);
  342. dst_word |= (res << 16);
  343. res = (x0 + x5) >> 14;
  344. ADD_AND_CLIP4(res);
  345. dst_word |= (res << 24);
  346. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  347. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  348. res = (x0 - x5) >> 14;
  349. ADD_AND_CLIP1(res);
  350. res2 = (x0 - x1) >> 14;
  351. ADD_AND_CLIP2(res2);
  352. dst_word = (res2 << 8) | res;
  353. res = (x0 - x2) >> 14;
  354. ADD_AND_CLIP3(res);
  355. dst_word |= (res << 16);
  356. res = (x0 - x4) >> 14;
  357. ADD_AND_CLIP4(res);
  358. dst_word |= (res << 24);
  359. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  360. }
  361. return ;
  362. }
  363. void idct_row3Inter(Short *blk, UChar *rec, Int lx)
  364. {
  365. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  366. int i = 8;
  367. uint32 pred_word, dst_word;
  368. int res, res2;
  369. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  370. rec -= lx;
  371. blk -= 8;
  372. while (i--)
  373. {
  374. x2 = blk[10];
  375. blk[10] = 0;
  376. x1 = blk[9];
  377. blk[9] = 0;
  378. x0 = ((*(blk += 8)) << 8) + 8192;
  379. *blk = 0; /* for proper rounding in the fourth stage */
  380. /* both upper and lower*/
  381. /* both x2orx6 and x0orx4 */
  382. x4 = x0;
  383. x6 = (W6 * x2 + 4) >> 3;
  384. x2 = (W2 * x2 + 4) >> 3;
  385. x8 = x0 - x2;
  386. x0 += x2;
  387. x2 = x8;
  388. x8 = x4 - x6;
  389. x4 += x6;
  390. x6 = x8;
  391. x7 = (W7 * x1 + 4) >> 3;
  392. x1 = (W1 * x1 + 4) >> 3;
  393. x3 = x7;
  394. x5 = (181 * (x1 - x7) + 128) >> 8;
  395. x7 = (181 * (x1 + x7) + 128) >> 8;
  396. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  397. res = (x0 + x1) >> 14;
  398. ADD_AND_CLIP1(res);
  399. res2 = (x4 + x7) >> 14;
  400. ADD_AND_CLIP2(res2);
  401. dst_word = (res2 << 8) | res;
  402. res = (x6 + x5) >> 14;
  403. ADD_AND_CLIP3(res);
  404. dst_word |= (res << 16);
  405. res = (x2 + x3) >> 14;
  406. ADD_AND_CLIP4(res);
  407. dst_word |= (res << 24);
  408. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  409. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  410. res = (x2 - x3) >> 14;
  411. ADD_AND_CLIP1(res);
  412. res2 = (x6 - x5) >> 14;
  413. ADD_AND_CLIP2(res2);
  414. dst_word = (res2 << 8) | res;
  415. res = (x4 - x7) >> 14;
  416. ADD_AND_CLIP3(res);
  417. dst_word |= (res << 16);
  418. res = (x0 - x1) >> 14;
  419. ADD_AND_CLIP4(res);
  420. dst_word |= (res << 24);
  421. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  422. }
  423. return ;
  424. }
  425. void idct_row4Inter(Short *blk, UChar *rec, Int lx)
  426. {
  427. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  428. int i = 8;
  429. uint32 pred_word, dst_word;
  430. int res, res2;
  431. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  432. rec -= lx;
  433. blk -= 8;
  434. while (i--)
  435. {
  436. x2 = blk[10];
  437. blk[10] = 0;
  438. x1 = blk[9];
  439. blk[9] = 0;
  440. x3 = blk[11];
  441. blk[11] = 0;
  442. x0 = ((*(blk += 8)) << 8) + 8192;
  443. *blk = 0; /* for proper rounding in the fourth stage */
  444. x4 = x0;
  445. x6 = (W6 * x2 + 4) >> 3;
  446. x2 = (W2 * x2 + 4) >> 3;
  447. x8 = x0 - x2;
  448. x0 += x2;
  449. x2 = x8;
  450. x8 = x4 - x6;
  451. x4 += x6;
  452. x6 = x8;
  453. x7 = (W7 * x1 + 4) >> 3;
  454. x1 = (W1 * x1 + 4) >> 3;
  455. x5 = (W3 * x3 + 4) >> 3;
  456. x3 = (- W5 * x3 + 4) >> 3;
  457. x8 = x1 - x5;
  458. x1 += x5;
  459. x5 = x8;
  460. x8 = x7 - x3;
  461. x3 += x7;
  462. x7 = (181 * (x5 + x8) + 128) >> 8;
  463. x5 = (181 * (x5 - x8) + 128) >> 8;
  464. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  465. res = (x0 + x1) >> 14;
  466. ADD_AND_CLIP1(res);
  467. res2 = (x4 + x7) >> 14;
  468. ADD_AND_CLIP2(res2);
  469. dst_word = (res2 << 8) | res;
  470. res = (x6 + x5) >> 14;
  471. ADD_AND_CLIP3(res);
  472. dst_word |= (res << 16);
  473. res = (x2 + x3) >> 14;
  474. ADD_AND_CLIP4(res);
  475. dst_word |= (res << 24);
  476. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  477. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  478. res = (x2 - x3) >> 14;
  479. ADD_AND_CLIP1(res);
  480. res2 = (x6 - x5) >> 14;
  481. ADD_AND_CLIP2(res2);
  482. dst_word = (res2 << 8) | res;
  483. res = (x4 - x7) >> 14;
  484. ADD_AND_CLIP3(res);
  485. dst_word |= (res << 16);
  486. res = (x0 - x1) >> 14;
  487. ADD_AND_CLIP4(res);
  488. dst_word |= (res << 24);
  489. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  490. }
  491. return ;
  492. }
  493. #ifndef SMALL_DCT
  494. void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
  495. {
  496. int32 x1, x2, x4, x5;
  497. int i = 8;
  498. uint32 pred_word, dst_word;
  499. int res, res2;
  500. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  501. rec -= lx;
  502. while (i--)
  503. {
  504. /* shortcut */
  505. x4 = blk[1];
  506. blk[1] = 0;
  507. blk += 8; /* for proper rounding in the fourth stage */
  508. /* first stage */
  509. x5 = (W7 * x4 + 4) >> 3;
  510. x4 = (W1 * x4 + 4) >> 3;
  511. /* third stage */
  512. x2 = (181 * (x4 + x5) + 128) >> 8;
  513. x1 = (181 * (x4 - x5) + 128) >> 8;
  514. /* fourth stage */
  515. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  516. res = (8192 + x4) >> 14;
  517. ADD_AND_CLIP1(res);
  518. res2 = (8192 + x2) >> 14;
  519. ADD_AND_CLIP2(res2);
  520. dst_word = (res2 << 8) | res;
  521. res = (8192 + x1) >> 14;
  522. ADD_AND_CLIP3(res);
  523. dst_word |= (res << 16);
  524. res = (8192 + x5) >> 14;
  525. ADD_AND_CLIP4(res);
  526. dst_word |= (res << 24);
  527. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  528. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  529. res = (8192 - x5) >> 14;
  530. ADD_AND_CLIP1(res);
  531. res2 = (8192 - x1) >> 14;
  532. ADD_AND_CLIP2(res2);
  533. dst_word = (res2 << 8) | res;
  534. res = (8192 - x2) >> 14;
  535. ADD_AND_CLIP3(res);
  536. dst_word |= (res << 16);
  537. res = (8192 - x4) >> 14;
  538. ADD_AND_CLIP4(res);
  539. dst_word |= (res << 24);
  540. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  541. }
  542. return ;
  543. }
  544. void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
  545. {
  546. int32 x0, x2, x4, x6;
  547. int i = 8;
  548. uint32 pred_word, dst_word;
  549. int res, res2;
  550. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  551. rec -= lx;
  552. while (i--)
  553. {
  554. x2 = blk[2];
  555. blk[2] = 0;
  556. blk += 8; /* for proper rounding in the fourth stage */
  557. /* both upper and lower*/
  558. /* both x2orx6 and x0orx4 */
  559. x6 = (W6 * x2 + 4) >> 3;
  560. x2 = (W2 * x2 + 4) >> 3;
  561. x0 = 8192 + x2;
  562. x2 = 8192 - x2;
  563. x4 = 8192 + x6;
  564. x6 = 8192 - x6;
  565. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  566. res = (x0) >> 14;
  567. ADD_AND_CLIP1(res);
  568. res2 = (x4) >> 14;
  569. ADD_AND_CLIP2(res2);
  570. dst_word = (res2 << 8) | res;
  571. res = (x6) >> 14;
  572. ADD_AND_CLIP3(res);
  573. dst_word |= (res << 16);
  574. res = (x2) >> 14;
  575. ADD_AND_CLIP4(res);
  576. dst_word |= (res << 24);
  577. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  578. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  579. res = (x2) >> 14;
  580. ADD_AND_CLIP1(res);
  581. res2 = (x6) >> 14;
  582. ADD_AND_CLIP2(res2);
  583. dst_word = (res2 << 8) | res;
  584. res = (x4) >> 14;
  585. ADD_AND_CLIP3(res);
  586. dst_word |= (res << 16);
  587. res = (x0) >> 14;
  588. ADD_AND_CLIP4(res);
  589. dst_word |= (res << 24);
  590. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  591. }
  592. return ;
  593. }
  594. void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
  595. {
  596. int32 x1, x3, x5, x7;
  597. int i = 8;
  598. uint32 pred_word, dst_word;
  599. int res, res2;
  600. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  601. rec -= lx;
  602. while (i--)
  603. {
  604. x3 = blk[3];
  605. blk[3] = 0;
  606. blk += 8;
  607. x1 = (W3 * x3 + 4) >> 3;
  608. x3 = (-W5 * x3 + 4) >> 3;
  609. x7 = (-181 * (x3 + x1) + 128) >> 8;
  610. x5 = (181 * (x3 - x1) + 128) >> 8;
  611. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  612. res = (8192 + x1) >> 14;
  613. ADD_AND_CLIP1(res);
  614. res2 = (8192 + x7) >> 14;
  615. ADD_AND_CLIP2(res2);
  616. dst_word = (res2 << 8) | res;
  617. res = (8192 + x5) >> 14;
  618. ADD_AND_CLIP3(res);
  619. dst_word |= (res << 16);
  620. res = (8192 + x3) >> 14;
  621. ADD_AND_CLIP4(res);
  622. dst_word |= (res << 24);
  623. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  624. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  625. res = (8192 - x3) >> 14;
  626. ADD_AND_CLIP1(res);
  627. res2 = (8192 - x5) >> 14;
  628. ADD_AND_CLIP2(res2);
  629. dst_word = (res2 << 8) | res;
  630. res = (8192 - x7) >> 14;
  631. ADD_AND_CLIP3(res);
  632. dst_word |= (res << 16);
  633. res = (8192 - x1) >> 14;
  634. ADD_AND_CLIP4(res);
  635. dst_word |= (res << 24);
  636. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  637. }
  638. return ;
  639. }
  640. #endif /* SMALL_DCT */
  641. void idct_rowInter(Short *blk, UChar *rec, Int lx)
  642. {
  643. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  644. int i = 8;
  645. uint32 pred_word, dst_word;
  646. int res, res2;
  647. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  648. rec -= lx;
  649. blk -= 8;
  650. while (i--)
  651. {
  652. x1 = (int32)blk[12] << 8;
  653. blk[12] = 0;
  654. x2 = blk[14];
  655. blk[14] = 0;
  656. x3 = blk[10];
  657. blk[10] = 0;
  658. x4 = blk[9];
  659. blk[9] = 0;
  660. x5 = blk[15];
  661. blk[15] = 0;
  662. x6 = blk[13];
  663. blk[13] = 0;
  664. x7 = blk[11];
  665. blk[11] = 0;
  666. x0 = ((*(blk += 8)) << 8) + 8192;
  667. *blk = 0; /* for proper rounding in the fourth stage */
  668. /* first stage */
  669. x8 = W7 * (x4 + x5) + 4;
  670. x4 = (x8 + (W1 - W7) * x4) >> 3;
  671. x5 = (x8 - (W1 + W7) * x5) >> 3;
  672. x8 = W3 * (x6 + x7) + 4;
  673. x6 = (x8 - (W3 - W5) * x6) >> 3;
  674. x7 = (x8 - (W3 + W5) * x7) >> 3;
  675. /* second stage */
  676. x8 = x0 + x1;
  677. x0 -= x1;
  678. x1 = W6 * (x3 + x2) + 4;
  679. x2 = (x1 - (W2 + W6) * x2) >> 3;
  680. x3 = (x1 + (W2 - W6) * x3) >> 3;
  681. x1 = x4 + x6;
  682. x4 -= x6;
  683. x6 = x5 + x7;
  684. x5 -= x7;
  685. /* third stage */
  686. x7 = x8 + x3;
  687. x8 -= x3;
  688. x3 = x0 + x2;
  689. x0 -= x2;
  690. x2 = (181 * (x4 + x5) + 128) >> 8;
  691. x4 = (181 * (x4 - x5) + 128) >> 8;
  692. /* fourth stage */
  693. pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
  694. res = (x7 + x1) >> 14;
  695. ADD_AND_CLIP1(res);
  696. res2 = (x3 + x2) >> 14;
  697. ADD_AND_CLIP2(res2);
  698. dst_word = (res2 << 8) | res;
  699. res = (x0 + x4) >> 14;
  700. ADD_AND_CLIP3(res);
  701. dst_word |= (res << 16);
  702. res = (x8 + x6) >> 14;
  703. ADD_AND_CLIP4(res);
  704. dst_word |= (res << 24);
  705. *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
  706. pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
  707. res = (x8 - x6) >> 14;
  708. ADD_AND_CLIP1(res);
  709. res2 = (x0 - x4) >> 14;
  710. ADD_AND_CLIP2(res2);
  711. dst_word = (res2 << 8) | res;
  712. res = (x3 - x2) >> 14;
  713. ADD_AND_CLIP3(res);
  714. dst_word |= (res << 16);
  715. res = (x7 - x1) >> 14;
  716. ADD_AND_CLIP4(res);
  717. dst_word |= (res << 24);
  718. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  719. }
  720. return;
  721. }
  722. void idct_row0Intra(Short *srce, UChar *rec, Int lx)
  723. {
  724. OSCL_UNUSED_ARG(srce);
  725. OSCL_UNUSED_ARG(rec);
  726. OSCL_UNUSED_ARG(lx);
  727. return;
  728. }
  729. void idct_row1Intra(Short *blk, UChar *rec, Int lx)
  730. {
  731. int32 tmp;
  732. int i = 8;
  733. rec -= lx;
  734. blk -= 8;
  735. while (i--)
  736. {
  737. tmp = ((*(blk += 8) + 32) >> 6);
  738. *blk = 0;
  739. CLIP_RESULT(tmp)
  740. tmp |= (tmp << 8);
  741. tmp |= (tmp << 16);
  742. *((uint32*)(rec += lx)) = tmp;
  743. *((uint32*)(rec + 4)) = tmp;
  744. }
  745. return;
  746. }
  747. void idct_row2Intra(Short *blk, UChar *rec, Int lx)
  748. {
  749. int32 x0, x1, x2, x4, x5;
  750. int res, res2;
  751. uint32 dst_word;
  752. int i = 8;
  753. rec -= lx;
  754. blk -= 8;
  755. while (i--)
  756. {
  757. /* shortcut */
  758. x4 = blk[9];
  759. blk[9] = 0;
  760. x0 = ((*(blk += 8)) << 8) + 8192;
  761. *blk = 0; /* for proper rounding in the fourth stage */
  762. /* first stage */
  763. x5 = (W7 * x4 + 4) >> 3;
  764. x4 = (W1 * x4 + 4) >> 3;
  765. /* third stage */
  766. x2 = (181 * (x4 + x5) + 128) >> 8;
  767. x1 = (181 * (x4 - x5) + 128) >> 8;
  768. /* fourth stage */
  769. res = ((x0 + x4) >> 14);
  770. CLIP_RESULT(res)
  771. res2 = ((x0 + x2) >> 14);
  772. CLIP_RESULT(res2)
  773. dst_word = (res2 << 8) | res;
  774. res = ((x0 + x1) >> 14);
  775. CLIP_RESULT(res)
  776. dst_word |= (res << 16);
  777. res = ((x0 + x5) >> 14);
  778. CLIP_RESULT(res)
  779. dst_word |= (res << 24);
  780. *((uint32*)(rec += lx)) = dst_word;
  781. res = ((x0 - x5) >> 14);
  782. CLIP_RESULT(res)
  783. res2 = ((x0 - x1) >> 14);
  784. CLIP_RESULT(res2)
  785. dst_word = (res2 << 8) | res;
  786. res = ((x0 - x2) >> 14);
  787. CLIP_RESULT(res)
  788. dst_word |= (res << 16);
  789. res = ((x0 - x4) >> 14);
  790. CLIP_RESULT(res)
  791. dst_word |= (res << 24);
  792. *((uint32*)(rec + 4)) = dst_word;
  793. }
  794. return ;
  795. }
  796. void idct_row3Intra(Short *blk, UChar *rec, Int lx)
  797. {
  798. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  799. int res, res2;
  800. uint32 dst_word;
  801. int i = 8;
  802. rec -= lx;
  803. blk -= 8;
  804. while (i--)
  805. {
  806. x2 = blk[10];
  807. blk[10] = 0;
  808. x1 = blk[9];
  809. blk[9] = 0;
  810. x0 = ((*(blk += 8)) << 8) + 8192;
  811. *blk = 0;/* for proper rounding in the fourth stage */
  812. /* both upper and lower*/
  813. /* both x2orx6 and x0orx4 */
  814. x4 = x0;
  815. x6 = (W6 * x2 + 4) >> 3;
  816. x2 = (W2 * x2 + 4) >> 3;
  817. x8 = x0 - x2;
  818. x0 += x2;
  819. x2 = x8;
  820. x8 = x4 - x6;
  821. x4 += x6;
  822. x6 = x8;
  823. x7 = (W7 * x1 + 4) >> 3;
  824. x1 = (W1 * x1 + 4) >> 3;
  825. x3 = x7;
  826. x5 = (181 * (x1 - x7) + 128) >> 8;
  827. x7 = (181 * (x1 + x7) + 128) >> 8;
  828. res = ((x0 + x1) >> 14);
  829. CLIP_RESULT(res)
  830. res2 = ((x4 + x7) >> 14);
  831. CLIP_RESULT(res2)
  832. dst_word = (res2 << 8) | res;
  833. res = ((x6 + x5) >> 14);
  834. CLIP_RESULT(res)
  835. dst_word |= (res << 16);
  836. res = ((x2 + x3) >> 14);
  837. CLIP_RESULT(res)
  838. dst_word |= (res << 24);
  839. *((uint32*)(rec += lx)) = dst_word;
  840. res = ((x2 - x3) >> 14);
  841. CLIP_RESULT(res)
  842. res2 = ((x6 - x5) >> 14);
  843. CLIP_RESULT(res2)
  844. dst_word = (res2 << 8) | res;
  845. res = ((x4 - x7) >> 14);
  846. CLIP_RESULT(res)
  847. dst_word |= (res << 16);
  848. res = ((x0 - x1) >> 14);
  849. CLIP_RESULT(res)
  850. dst_word |= (res << 24);
  851. *((uint32*)(rec + 4)) = dst_word;
  852. }
  853. return ;
  854. }
  855. void idct_row4Intra(Short *blk, UChar *rec, Int lx)
  856. {
  857. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  858. int res, res2;
  859. uint32 dst_word;
  860. int i = 8;
  861. rec -= lx;
  862. blk -= 8;
  863. while (i--)
  864. {
  865. x2 = blk[10];
  866. blk[10] = 0;
  867. x1 = blk[9];
  868. blk[9] = 0;
  869. x3 = blk[11];
  870. blk[11] = 0;
  871. x0 = ((*(blk += 8)) << 8) + 8192;
  872. *blk = 0; /* for proper rounding in the fourth stage */
  873. x4 = x0;
  874. x6 = (W6 * x2 + 4) >> 3;
  875. x2 = (W2 * x2 + 4) >> 3;
  876. x8 = x0 - x2;
  877. x0 += x2;
  878. x2 = x8;
  879. x8 = x4 - x6;
  880. x4 += x6;
  881. x6 = x8;
  882. x7 = (W7 * x1 + 4) >> 3;
  883. x1 = (W1 * x1 + 4) >> 3;
  884. x5 = (W3 * x3 + 4) >> 3;
  885. x3 = (- W5 * x3 + 4) >> 3;
  886. x8 = x1 - x5;
  887. x1 += x5;
  888. x5 = x8;
  889. x8 = x7 - x3;
  890. x3 += x7;
  891. x7 = (181 * (x5 + x8) + 128) >> 8;
  892. x5 = (181 * (x5 - x8) + 128) >> 8;
  893. res = ((x0 + x1) >> 14);
  894. CLIP_RESULT(res)
  895. res2 = ((x4 + x7) >> 14);
  896. CLIP_RESULT(res2)
  897. dst_word = (res2 << 8) | res;
  898. res = ((x6 + x5) >> 14);
  899. CLIP_RESULT(res)
  900. dst_word |= (res << 16);
  901. res = ((x2 + x3) >> 14);
  902. CLIP_RESULT(res)
  903. dst_word |= (res << 24);
  904. *((uint32*)(rec += lx)) = dst_word;
  905. res = ((x2 - x3) >> 14);
  906. CLIP_RESULT(res)
  907. res2 = ((x6 - x5) >> 14);
  908. CLIP_RESULT(res2)
  909. dst_word = (res2 << 8) | res;
  910. res = ((x4 - x7) >> 14);
  911. CLIP_RESULT(res)
  912. dst_word |= (res << 16);
  913. res = ((x0 - x1) >> 14);
  914. CLIP_RESULT(res)
  915. dst_word |= (res << 24);
  916. *((uint32*)(rec + 4)) = dst_word;
  917. }
  918. return ;
  919. }
  920. #ifndef SMALL_DCT
  921. void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
  922. {
  923. int32 x1, x2, x4, x5;
  924. int res, res2;
  925. uint32 dst_word;
  926. int i = 8;
  927. rec -= lx;
  928. while (i--)
  929. {
  930. /* shortcut */
  931. x4 = blk[1];
  932. blk[1] = 0;
  933. blk += 8;
  934. /* first stage */
  935. x5 = (W7 * x4 + 4) >> 3;
  936. x4 = (W1 * x4 + 4) >> 3;
  937. /* third stage */
  938. x2 = (181 * (x4 + x5) + 128) >> 8;
  939. x1 = (181 * (x4 - x5) + 128) >> 8;
  940. /* fourth stage */
  941. res = ((8192 + x4) >> 14);
  942. CLIP_RESULT(res)
  943. res2 = ((8192 + x2) >> 14);
  944. CLIP_RESULT(res2)
  945. dst_word = (res2 << 8) | res;
  946. res = ((8192 + x1) >> 14);
  947. CLIP_RESULT(res)
  948. dst_word |= (res << 16);
  949. res = ((8192 + x5) >> 14);
  950. CLIP_RESULT(res)
  951. dst_word |= (res << 24);
  952. *((uint32*)(rec += lx)) = dst_word;
  953. res = ((8192 - x5) >> 14);
  954. CLIP_RESULT(res)
  955. res2 = ((8192 - x1) >> 14);
  956. CLIP_RESULT(res2)
  957. dst_word = (res2 << 8) | res;
  958. res = ((8192 - x2) >> 14);
  959. CLIP_RESULT(res)
  960. dst_word |= (res << 16);
  961. res = ((8192 - x4) >> 14);
  962. CLIP_RESULT(res)
  963. dst_word |= (res << 24);
  964. *((uint32*)(rec + 4)) = dst_word;
  965. }
  966. return ;
  967. }
  968. void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
  969. {
  970. int32 x0, x2, x4, x6;
  971. int res, res2;
  972. uint32 dst_word;
  973. int i = 8;
  974. rec -= lx;
  975. while (i--)
  976. {
  977. x2 = blk[2];
  978. blk[2] = 0;
  979. blk += 8;
  980. /* both upper and lower*/
  981. /* both x2orx6 and x0orx4 */
  982. x6 = (W6 * x2 + 4) >> 3;
  983. x2 = (W2 * x2 + 4) >> 3;
  984. x0 = 8192 + x2;
  985. x2 = 8192 - x2;
  986. x4 = 8192 + x6;
  987. x6 = 8192 - x6;
  988. res = ((x0) >> 14);
  989. CLIP_RESULT(res)
  990. res2 = ((x4) >> 14);
  991. CLIP_RESULT(res2)
  992. dst_word = (res2 << 8) | res;
  993. res = ((x6) >> 14);
  994. CLIP_RESULT(res)
  995. dst_word |= (res << 16);
  996. res = ((x2) >> 14);
  997. CLIP_RESULT(res)
  998. dst_word |= (res << 24);
  999. *((uint32*)(rec += lx)) = dst_word;
  1000. res = ((x2) >> 14);
  1001. CLIP_RESULT(res)
  1002. res2 = ((x6) >> 14);
  1003. CLIP_RESULT(res2)
  1004. dst_word = (res2 << 8) | res;
  1005. res = ((x4) >> 14);
  1006. CLIP_RESULT(res)
  1007. dst_word |= (res << 16);
  1008. res = ((x0) >> 14);
  1009. CLIP_RESULT(res)
  1010. dst_word |= (res << 24);
  1011. *((uint32*)(rec + 4)) = dst_word;
  1012. }
  1013. return ;
  1014. }
  1015. void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
  1016. {
  1017. int32 x1, x3, x5, x7;
  1018. int res, res2;
  1019. uint32 dst_word;
  1020. int i = 8;
  1021. rec -= lx;
  1022. while (i--)
  1023. {
  1024. x3 = blk[3];
  1025. blk[3] = 0 ;
  1026. blk += 8;
  1027. x1 = (W3 * x3 + 4) >> 3;
  1028. x3 = (W5 * x3 + 4) >> 3;
  1029. x7 = (181 * (x3 - x1) + 128) >> 8;
  1030. x5 = (-181 * (x1 + x3) + 128) >> 8;
  1031. res = ((8192 + x1) >> 14);
  1032. CLIP_RESULT(res)
  1033. res2 = ((8192 + x7) >> 14);
  1034. CLIP_RESULT(res2)
  1035. dst_word = (res2 << 8) | res;
  1036. res = ((8192 + x5) >> 14);
  1037. CLIP_RESULT(res)
  1038. dst_word |= (res << 16);
  1039. res = ((8192 - x3) >> 14);
  1040. CLIP_RESULT(res)
  1041. dst_word |= (res << 24);
  1042. *((uint32*)(rec += lx)) = dst_word;
  1043. res = ((8192 + x3) >> 14);
  1044. CLIP_RESULT(res)
  1045. res2 = ((8192 - x5) >> 14);
  1046. CLIP_RESULT(res2)
  1047. dst_word = (res2 << 8) | res;
  1048. res = ((8192 - x7) >> 14);
  1049. CLIP_RESULT(res)
  1050. dst_word |= (res << 16);
  1051. res = ((8192 - x1) >> 14);
  1052. CLIP_RESULT(res)
  1053. dst_word |= (res << 24);
  1054. *((uint32*)(rec + 4)) = dst_word;
  1055. }
  1056. return ;
  1057. }
  1058. #endif /* SMALL_DCT */
  1059. void idct_rowIntra(Short *blk, UChar *rec, Int lx)
  1060. {
  1061. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  1062. int i = 8;
  1063. int res, res2;
  1064. uint32 dst_word;
  1065. blk -= 8;
  1066. rec -= lx;
  1067. while (i--)
  1068. {
  1069. x1 = (int32)blk[12] << 8;
  1070. blk[12] = 0;
  1071. x2 = blk[14];
  1072. blk[14] = 0;
  1073. x3 = blk[10];
  1074. blk[10] = 0;
  1075. x4 = blk[9];
  1076. blk[9] = 0;
  1077. x5 = blk[15];
  1078. blk[15] = 0;
  1079. x6 = blk[13];
  1080. blk[13] = 0;
  1081. x7 = blk[11];
  1082. blk[11] = 0;
  1083. x0 = ((*(blk += 8)) << 8) + 8192;
  1084. *blk = 0; /* for proper rounding in the fourth stage */
  1085. /* first stage */
  1086. x8 = W7 * (x4 + x5) + 4;
  1087. x4 = (x8 + (W1 - W7) * x4) >> 3;
  1088. x5 = (x8 - (W1 + W7) * x5) >> 3;
  1089. x8 = W3 * (x6 + x7) + 4;
  1090. x6 = (x8 - (W3 - W5) * x6) >> 3;
  1091. x7 = (x8 - (W3 + W5) * x7) >> 3;
  1092. /* second stage */
  1093. x8 = x0 + x1;
  1094. x0 -= x1;
  1095. x1 = W6 * (x3 + x2) + 4;
  1096. x2 = (x1 - (W2 + W6) * x2) >> 3;
  1097. x3 = (x1 + (W2 - W6) * x3) >> 3;
  1098. x1 = x4 + x6;
  1099. x4 -= x6;
  1100. x6 = x5 + x7;
  1101. x5 -= x7;
  1102. /* third stage */
  1103. x7 = x8 + x3;
  1104. x8 -= x3;
  1105. x3 = x0 + x2;
  1106. x0 -= x2;
  1107. x2 = (181 * (x4 + x5) + 128) >> 8;
  1108. x4 = (181 * (x4 - x5) + 128) >> 8;
  1109. /* fourth stage */
  1110. res = ((x7 + x1) >> 14);
  1111. CLIP_RESULT(res)
  1112. res2 = ((x3 + x2) >> 14);
  1113. CLIP_RESULT(res2)
  1114. dst_word = res | (res2 << 8);
  1115. res = ((x0 + x4) >> 14);
  1116. CLIP_RESULT(res)
  1117. dst_word |= (res << 16);
  1118. res = ((x8 + x6) >> 14);
  1119. CLIP_RESULT(res)
  1120. dst_word |= (res << 24);
  1121. *((uint32*)(rec += lx)) = dst_word;
  1122. res = ((x8 - x6) >> 14);
  1123. CLIP_RESULT(res)
  1124. res2 = ((x0 - x4) >> 14);
  1125. CLIP_RESULT(res2)
  1126. dst_word = res | (res2 << 8);
  1127. res = ((x3 - x2) >> 14);
  1128. CLIP_RESULT(res)
  1129. dst_word |= (res << 16);
  1130. res = ((x7 - x1) >> 14);
  1131. CLIP_RESULT(res)
  1132. dst_word |= (res << 24);
  1133. *((uint32*)(rec + 4)) = dst_word;
  1134. }
  1135. return;
  1136. }
  1137. /* This function should not be called at all ****/
  1138. void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
  1139. {
  1140. OSCL_UNUSED_ARG(srce);
  1141. OSCL_UNUSED_ARG(rec);
  1142. OSCL_UNUSED_ARG(pred);
  1143. OSCL_UNUSED_ARG(lx);
  1144. return;
  1145. }
  1146. void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1147. {
  1148. int tmp;
  1149. int i = 8;
  1150. uint32 pred_word, dst_word;
  1151. int res, res2;
  1152. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1153. pred -= 16;
  1154. rec -= lx;
  1155. blk -= 8;
  1156. while (i--)
  1157. {
  1158. tmp = (*(blk += 8) + 32) >> 6;
  1159. *blk = 0;
  1160. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1161. res = tmp + (pred_word & 0xFF);
  1162. CLIP_RESULT(res);
  1163. res2 = tmp + ((pred_word >> 8) & 0xFF);
  1164. CLIP_RESULT(res2);
  1165. dst_word = (res2 << 8) | res;
  1166. res = tmp + ((pred_word >> 16) & 0xFF);
  1167. CLIP_RESULT(res);
  1168. dst_word |= (res << 16);
  1169. res = tmp + ((pred_word >> 24) & 0xFF);
  1170. CLIP_RESULT(res);
  1171. dst_word |= (res << 24);
  1172. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1173. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1174. res = tmp + (pred_word & 0xFF);
  1175. CLIP_RESULT(res);
  1176. res2 = tmp + ((pred_word >> 8) & 0xFF);
  1177. CLIP_RESULT(res2);
  1178. dst_word = (res2 << 8) | res;
  1179. res = tmp + ((pred_word >> 16) & 0xFF);
  1180. CLIP_RESULT(res);
  1181. dst_word |= (res << 16);
  1182. res = tmp + ((pred_word >> 24) & 0xFF);
  1183. CLIP_RESULT(res);
  1184. dst_word |= (res << 24);
  1185. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1186. }
  1187. return;
  1188. }
  1189. void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1190. {
  1191. int32 x0, x1, x2, x4, x5;
  1192. int i = 8;
  1193. uint32 pred_word, dst_word;
  1194. int res, res2;
  1195. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1196. rec -= lx;
  1197. pred -= 16;
  1198. blk -= 8;
  1199. while (i--)
  1200. {
  1201. /* shortcut */
  1202. x4 = blk[9];
  1203. blk[9] = 0;
  1204. x0 = ((*(blk += 8)) << 8) + 8192;
  1205. *blk = 0; /* for proper rounding in the fourth stage */
  1206. /* first stage */
  1207. x5 = (W7 * x4 + 4) >> 3;
  1208. x4 = (W1 * x4 + 4) >> 3;
  1209. /* third stage */
  1210. x2 = (181 * (x4 + x5) + 128) >> 8;
  1211. x1 = (181 * (x4 - x5) + 128) >> 8;
  1212. /* fourth stage */
  1213. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1214. res = (x0 + x4) >> 14;
  1215. ADD_AND_CLIP1(res);
  1216. res2 = (x0 + x2) >> 14;
  1217. ADD_AND_CLIP2(res2);
  1218. dst_word = (res2 << 8) | res;
  1219. res = (x0 + x1) >> 14;
  1220. ADD_AND_CLIP3(res);
  1221. dst_word |= (res << 16);
  1222. res = (x0 + x5) >> 14;
  1223. ADD_AND_CLIP4(res);
  1224. dst_word |= (res << 24);
  1225. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1226. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1227. res = (x0 - x5) >> 14;
  1228. ADD_AND_CLIP1(res);
  1229. res2 = (x0 - x1) >> 14;
  1230. ADD_AND_CLIP2(res2);
  1231. dst_word = (res2 << 8) | res;
  1232. res = (x0 - x2) >> 14;
  1233. ADD_AND_CLIP3(res);
  1234. dst_word |= (res << 16);
  1235. res = (x0 - x4) >> 14;
  1236. ADD_AND_CLIP4(res);
  1237. dst_word |= (res << 24);
  1238. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1239. }
  1240. return ;
  1241. }
  1242. void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1243. {
  1244. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  1245. int i = 8;
  1246. uint32 pred_word, dst_word;
  1247. int res, res2;
  1248. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1249. rec -= lx;
  1250. pred -= 16;
  1251. blk -= 8;
  1252. while (i--)
  1253. {
  1254. x2 = blk[10];
  1255. blk[10] = 0;
  1256. x1 = blk[9];
  1257. blk[9] = 0;
  1258. x0 = ((*(blk += 8)) << 8) + 8192;
  1259. *blk = 0; /* for proper rounding in the fourth stage */
  1260. /* both upper and lower*/
  1261. /* both x2orx6 and x0orx4 */
  1262. x4 = x0;
  1263. x6 = (W6 * x2 + 4) >> 3;
  1264. x2 = (W2 * x2 + 4) >> 3;
  1265. x8 = x0 - x2;
  1266. x0 += x2;
  1267. x2 = x8;
  1268. x8 = x4 - x6;
  1269. x4 += x6;
  1270. x6 = x8;
  1271. x7 = (W7 * x1 + 4) >> 3;
  1272. x1 = (W1 * x1 + 4) >> 3;
  1273. x3 = x7;
  1274. x5 = (181 * (x1 - x7) + 128) >> 8;
  1275. x7 = (181 * (x1 + x7) + 128) >> 8;
  1276. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1277. res = (x0 + x1) >> 14;
  1278. ADD_AND_CLIP1(res);
  1279. res2 = (x4 + x7) >> 14;
  1280. ADD_AND_CLIP2(res2);
  1281. dst_word = (res2 << 8) | res;
  1282. res = (x6 + x5) >> 14;
  1283. ADD_AND_CLIP3(res);
  1284. dst_word |= (res << 16);
  1285. res = (x2 + x3) >> 14;
  1286. ADD_AND_CLIP4(res);
  1287. dst_word |= (res << 24);
  1288. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1289. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1290. res = (x2 - x3) >> 14;
  1291. ADD_AND_CLIP1(res);
  1292. res2 = (x6 - x5) >> 14;
  1293. ADD_AND_CLIP2(res2);
  1294. dst_word = (res2 << 8) | res;
  1295. res = (x4 - x7) >> 14;
  1296. ADD_AND_CLIP3(res);
  1297. dst_word |= (res << 16);
  1298. res = (x0 - x1) >> 14;
  1299. ADD_AND_CLIP4(res);
  1300. dst_word |= (res << 24);
  1301. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1302. }
  1303. return ;
  1304. }
  1305. void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1306. {
  1307. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  1308. int i = 8;
  1309. uint32 pred_word, dst_word;
  1310. int res, res2;
  1311. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1312. rec -= lx;
  1313. pred -= 16;
  1314. blk -= 8;
  1315. while (i--)
  1316. {
  1317. x2 = blk[10];
  1318. blk[10] = 0;
  1319. x1 = blk[9];
  1320. blk[9] = 0;
  1321. x3 = blk[11];
  1322. blk[11] = 0;
  1323. x0 = ((*(blk += 8)) << 8) + 8192;
  1324. *blk = 0; /* for proper rounding in the fourth stage */
  1325. x4 = x0;
  1326. x6 = (W6 * x2 + 4) >> 3;
  1327. x2 = (W2 * x2 + 4) >> 3;
  1328. x8 = x0 - x2;
  1329. x0 += x2;
  1330. x2 = x8;
  1331. x8 = x4 - x6;
  1332. x4 += x6;
  1333. x6 = x8;
  1334. x7 = (W7 * x1 + 4) >> 3;
  1335. x1 = (W1 * x1 + 4) >> 3;
  1336. x5 = (W3 * x3 + 4) >> 3;
  1337. x3 = (- W5 * x3 + 4) >> 3;
  1338. x8 = x1 - x5;
  1339. x1 += x5;
  1340. x5 = x8;
  1341. x8 = x7 - x3;
  1342. x3 += x7;
  1343. x7 = (181 * (x5 + x8) + 128) >> 8;
  1344. x5 = (181 * (x5 - x8) + 128) >> 8;
  1345. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1346. res = (x0 + x1) >> 14;
  1347. ADD_AND_CLIP1(res);
  1348. res2 = (x4 + x7) >> 14;
  1349. ADD_AND_CLIP2(res2);
  1350. dst_word = (res2 << 8) | res;
  1351. res = (x6 + x5) >> 14;
  1352. ADD_AND_CLIP3(res);
  1353. dst_word |= (res << 16);
  1354. res = (x2 + x3) >> 14;
  1355. ADD_AND_CLIP4(res);
  1356. dst_word |= (res << 24);
  1357. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1358. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1359. res = (x2 - x3) >> 14;
  1360. ADD_AND_CLIP1(res);
  1361. res2 = (x6 - x5) >> 14;
  1362. ADD_AND_CLIP2(res2);
  1363. dst_word = (res2 << 8) | res;
  1364. res = (x4 - x7) >> 14;
  1365. ADD_AND_CLIP3(res);
  1366. dst_word |= (res << 16);
  1367. res = (x0 - x1) >> 14;
  1368. ADD_AND_CLIP4(res);
  1369. dst_word |= (res << 24);
  1370. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1371. }
  1372. return ;
  1373. }
  1374. #ifndef SMALL_DCT
  1375. void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1376. {
  1377. int32 x1, x2, x4, x5;
  1378. int i = 8;
  1379. uint32 pred_word, dst_word;
  1380. int res, res2;
  1381. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1382. rec -= lx;
  1383. pred -= 16;
  1384. while (i--)
  1385. {
  1386. /* shortcut */
  1387. x4 = blk[1];
  1388. blk[1] = 0;
  1389. blk += 8; /* for proper rounding in the fourth stage */
  1390. /* first stage */
  1391. x5 = (W7 * x4 + 4) >> 3;
  1392. x4 = (W1 * x4 + 4) >> 3;
  1393. /* third stage */
  1394. x2 = (181 * (x4 + x5) + 128) >> 8;
  1395. x1 = (181 * (x4 - x5) + 128) >> 8;
  1396. /* fourth stage */
  1397. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1398. res = (8192 + x4) >> 14;
  1399. ADD_AND_CLIP1(res);
  1400. res2 = (8192 + x2) >> 14;
  1401. ADD_AND_CLIP2(res2);
  1402. dst_word = (res2 << 8) | res;
  1403. res = (8192 + x1) >> 14;
  1404. ADD_AND_CLIP3(res);
  1405. dst_word |= (res << 16);
  1406. res = (8192 + x5) >> 14;
  1407. ADD_AND_CLIP4(res);
  1408. dst_word |= (res << 24);
  1409. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1410. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1411. res = (8192 - x5) >> 14;
  1412. ADD_AND_CLIP1(res);
  1413. res2 = (8192 - x1) >> 14;
  1414. ADD_AND_CLIP2(res2);
  1415. dst_word = (res2 << 8) | res;
  1416. res = (8192 - x2) >> 14;
  1417. ADD_AND_CLIP3(res);
  1418. dst_word |= (res << 16);
  1419. res = (8192 - x4) >> 14;
  1420. ADD_AND_CLIP4(res);
  1421. dst_word |= (res << 24);
  1422. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1423. }
  1424. return ;
  1425. }
  1426. void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1427. {
  1428. int32 x0, x2, x4, x6;
  1429. int i = 8;
  1430. uint32 pred_word, dst_word;
  1431. int res, res2;
  1432. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1433. rec -= lx;
  1434. pred -= 16;
  1435. while (i--)
  1436. {
  1437. x2 = blk[2];
  1438. blk[2] = 0;
  1439. blk += 8; /* for proper rounding in the fourth stage */
  1440. /* both upper and lower*/
  1441. /* both x2orx6 and x0orx4 */
  1442. x6 = (W6 * x2 + 4) >> 3;
  1443. x2 = (W2 * x2 + 4) >> 3;
  1444. x0 = 8192 + x2;
  1445. x2 = 8192 - x2;
  1446. x4 = 8192 + x6;
  1447. x6 = 8192 - x6;
  1448. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1449. res = (x0) >> 14;
  1450. ADD_AND_CLIP1(res);
  1451. res2 = (x4) >> 14;
  1452. ADD_AND_CLIP2(res2);
  1453. dst_word = (res2 << 8) | res;
  1454. res = (x6) >> 14;
  1455. ADD_AND_CLIP3(res);
  1456. dst_word |= (res << 16);
  1457. res = (x2) >> 14;
  1458. ADD_AND_CLIP4(res);
  1459. dst_word |= (res << 24);
  1460. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1461. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1462. res = (x2) >> 14;
  1463. ADD_AND_CLIP1(res);
  1464. res2 = (x6) >> 14;
  1465. ADD_AND_CLIP2(res2);
  1466. dst_word = (res2 << 8) | res;
  1467. res = (x4) >> 14;
  1468. ADD_AND_CLIP3(res);
  1469. dst_word |= (res << 16);
  1470. res = (x0) >> 14;
  1471. ADD_AND_CLIP4(res);
  1472. dst_word |= (res << 24);
  1473. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1474. }
  1475. return ;
  1476. }
  1477. void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1478. {
  1479. int32 x1, x3, x5, x7;
  1480. int i = 8;
  1481. uint32 pred_word, dst_word;
  1482. int res, res2;
  1483. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1484. rec -= lx;
  1485. pred -= 16;
  1486. while (i--)
  1487. {
  1488. x3 = blk[3];
  1489. blk[3] = 0;
  1490. blk += 8;
  1491. x1 = (W3 * x3 + 4) >> 3;
  1492. x3 = (-W5 * x3 + 4) >> 3;
  1493. x7 = (-181 * (x3 + x1) + 128) >> 8;
  1494. x5 = (181 * (x3 - x1) + 128) >> 8;
  1495. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1496. res = (8192 + x1) >> 14;
  1497. ADD_AND_CLIP1(res);
  1498. res2 = (8192 + x7) >> 14;
  1499. ADD_AND_CLIP2(res2);
  1500. dst_word = (res2 << 8) | res;
  1501. res = (8192 + x5) >> 14;
  1502. ADD_AND_CLIP3(res);
  1503. dst_word |= (res << 16);
  1504. res = (8192 + x3) >> 14;
  1505. ADD_AND_CLIP4(res);
  1506. dst_word |= (res << 24);
  1507. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1508. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1509. res = (8192 - x3) >> 14;
  1510. ADD_AND_CLIP1(res);
  1511. res2 = (8192 - x5) >> 14;
  1512. ADD_AND_CLIP2(res2);
  1513. dst_word = (res2 << 8) | res;
  1514. res = (8192 - x7) >> 14;
  1515. ADD_AND_CLIP3(res);
  1516. dst_word |= (res << 16);
  1517. res = (8192 - x1) >> 14;
  1518. ADD_AND_CLIP4(res);
  1519. dst_word |= (res << 24);
  1520. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1521. }
  1522. return ;
  1523. }
  1524. #endif /* SMALL_DCT */
  1525. void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
  1526. {
  1527. int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
  1528. int i = 8;
  1529. uint32 pred_word, dst_word;
  1530. int res, res2;
  1531. /* preset the offset, such that we can take advantage pre-offset addressing mode */
  1532. rec -= lx;
  1533. pred -= 16;
  1534. blk -= 8;
  1535. while (i--)
  1536. {
  1537. x1 = (int32)blk[12] << 8;
  1538. blk[12] = 0;
  1539. x2 = blk[14];
  1540. blk[14] = 0;
  1541. x3 = blk[10];
  1542. blk[10] = 0;
  1543. x4 = blk[9];
  1544. blk[9] = 0;
  1545. x5 = blk[15];
  1546. blk[15] = 0;
  1547. x6 = blk[13];
  1548. blk[13] = 0;
  1549. x7 = blk[11];
  1550. blk[11] = 0;
  1551. x0 = ((*(blk += 8)) << 8) + 8192;
  1552. *blk = 0; /* for proper rounding in the fourth stage */
  1553. /* first stage */
  1554. x8 = W7 * (x4 + x5) + 4;
  1555. x4 = (x8 + (W1 - W7) * x4) >> 3;
  1556. x5 = (x8 - (W1 + W7) * x5) >> 3;
  1557. x8 = W3 * (x6 + x7) + 4;
  1558. x6 = (x8 - (W3 - W5) * x6) >> 3;
  1559. x7 = (x8 - (W3 + W5) * x7) >> 3;
  1560. /* second stage */
  1561. x8 = x0 + x1;
  1562. x0 -= x1;
  1563. x1 = W6 * (x3 + x2) + 4;
  1564. x2 = (x1 - (W2 + W6) * x2) >> 3;
  1565. x3 = (x1 + (W2 - W6) * x3) >> 3;
  1566. x1 = x4 + x6;
  1567. x4 -= x6;
  1568. x6 = x5 + x7;
  1569. x5 -= x7;
  1570. /* third stage */
  1571. x7 = x8 + x3;
  1572. x8 -= x3;
  1573. x3 = x0 + x2;
  1574. x0 -= x2;
  1575. x2 = (181 * (x4 + x5) + 128) >> 8;
  1576. x4 = (181 * (x4 - x5) + 128) >> 8;
  1577. /* fourth stage */
  1578. pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
  1579. res = (x7 + x1) >> 14;
  1580. ADD_AND_CLIP1(res);
  1581. res2 = (x3 + x2) >> 14;
  1582. ADD_AND_CLIP2(res2);
  1583. dst_word = (res2 << 8) | res;
  1584. res = (x0 + x4) >> 14;
  1585. ADD_AND_CLIP3(res);
  1586. dst_word |= (res << 16);
  1587. res = (x8 + x6) >> 14;
  1588. ADD_AND_CLIP4(res);
  1589. dst_word |= (res << 24);
  1590. *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
  1591. pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
  1592. res = (x8 - x6) >> 14;
  1593. ADD_AND_CLIP1(res);
  1594. res2 = (x0 - x4) >> 14;
  1595. ADD_AND_CLIP2(res2);
  1596. dst_word = (res2 << 8) | res;
  1597. res = (x3 - x2) >> 14;
  1598. ADD_AND_CLIP3(res);
  1599. dst_word |= (res << 16);
  1600. res = (x7 - x1) >> 14;
  1601. ADD_AND_CLIP4(res);
  1602. dst_word |= (res << 24);
  1603. *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
  1604. }
  1605. return;
  1606. }
  1607. /*----------------------------------------------------------------------------
  1608. ; End Function: idctcol
  1609. ----------------------------------------------------------------------------*/
  1610. /* ======================================================================== */
  1611. /* Function : BlockIDCTMotionComp */
  1612. /* Date : 10/16/2000 */
  1613. /* Purpose : fast IDCT routine */
  1614. /* In/out : */
  1615. /* Int* coeff_in Dequantized coefficient
  1616. Int block_out output IDCT coefficient
  1617. Int maxval clip value */
  1618. /* Modified : 7/31/01, add checking for all-zero and DC-only block. */
  1619. /* do 8 columns at a time */
  1620. /* 8/2/01, do column first then row-IDCT. */
  1621. /* 8/2/01, remove clipping (included in motion comp). */
  1622. /* 8/7/01, combine with motion comp. */
  1623. /* 8/8/01, use AAN IDCT */
  1624. /* 9/4/05, use Chen's IDCT and 16 bit block */
  1625. /* ======================================================================== */
  1626. void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
  1627. Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
  1628. {
  1629. Int i;
  1630. Int tmp, tmp2;
  1631. ULong tmp4;
  1632. Int bmap;
  1633. Short *ptr = block;
  1634. UChar *endcol;
  1635. UInt mask = 0xFF;
  1636. Int lx = lx_intra >> 1;
  1637. Int intra = (lx_intra & 1);
  1638. /* all-zero block */
  1639. if (dctMode == 0 || bitmaprow == 0)
  1640. {
  1641. if (intra)
  1642. {
  1643. *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
  1644. *((ULong*)(rec += lx)) = 0;
  1645. *((ULong*)(rec + 4)) = 0;
  1646. *((ULong*)(rec += lx)) = 0;
  1647. *((ULong*)(rec + 4)) = 0;
  1648. *((ULong*)(rec += lx)) = 0;
  1649. *((ULong*)(rec + 4)) = 0;
  1650. *((ULong*)(rec += lx)) = 0;
  1651. *((ULong*)(rec + 4)) = 0;
  1652. *((ULong*)(rec += lx)) = 0;
  1653. *((ULong*)(rec + 4)) = 0;
  1654. *((ULong*)(rec += lx)) = 0;
  1655. *((ULong*)(rec + 4)) = 0;
  1656. *((ULong*)(rec += lx)) =