/media/libstagefright/codecs/m4v_h263/enc/src/sad_inline.h

https://bitbucket.org/aways/android_frameworks_av

/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
/*********************************************************************************/
/*  Filename: sad_inline.h                                                        */
/*  Description: Implementation for in-line functions used in dct.cpp             */
/*  Modified:                                                                      */
/*********************************************************************************/

#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

#if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* ARM GNU COMPILER */
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    tmp = tmp - tmp2;
    if (tmp > 0) sad += tmp;
    else sad -= tmp;

    return sad;
}
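/* Note: sad_4pixel() below computes four byte-wise absolute differences in
 * parallel inside one 32-bit word (a SWAR trick).  The XOR, shift and mask
 * sequence recovers the per-byte borrows that a single 32-bit subtraction
 * cannot track, then conditionally negates the byte lanes that went negative,
 * so each result byte ends up holding |src1_i - src2_i|.
 */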
__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    x7 = src2 ^ src1;       /* check odd/even combination */
    if ((uint32)src2 >= (uint32)src1)
    {
        src1 = src2 - src1;     /* subs */
    }
    else
    {
        src1 = src1 - src2;
    }
    x7 = x7 ^ src1;         /* only odd bytes need to add carry */
    x7 = mask & ((uint32)x7 >> 1);
    x7 = (x7 << 8) - x7;
    src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
    src1 = src1 ^(x7 >> 7);  /* take absolute value of negative byte */

    return src1;
}
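/* Illustrative reference only (not part of the original interface; the helper
 * name sad_4pixel_ref is hypothetical): for pixel bytes in the range 0..255
 * the routine above is equivalent to computing the four absolute differences
 * lane by lane, roughly:
 *
 *     static int32 sad_4pixel_ref(int32 src1, int32 src2)
 *     {
 *         int32 result = 0;
 *         int   i;
 *         for (i = 0; i < 4; i++)
 *         {
 *             int32 a = ((uint32)src1 >> (8 * i)) & 0xFF;
 *             int32 b = ((uint32)src2 >> (8 * i)) & 0xFF;
 *             int32 d = a - b;
 *             result |= (d >= 0 ? d : -d) << (8 * i);
 *         }
 *         return result;
 *     }
 */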
#define NUMBER 3
#define SHIFT 24
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
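/* Note: sad_mb_offset.h is included three times with different NUMBER/SHIFT
 * settings; each pass instantiates one of sad_mb_offset1/2/3, which handle
 * reference pointers misaligned by 1, 2 or 3 bytes.  simd_sad_mb() below
 * dispatches to them based on the low two bits of ref.
 */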
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */

    x8 = (uint32)ref & 0x3;
    if (x8 == 3)
        goto SadMBOffset3;
    if (x8 == 2)
        goto SadMBOffset2;
    if (x8 == 1)
        goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
    x4 = x5 = 0;

    x6 = 0xFFFF00FF;

    ref -= lx;
    blk -= 16;

    x8 = 16;
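    /* Note: the loop below runs once per macroblock row (x8 counts down from
     * 16).  Each iteration loads 16 reference and 16 block pixels as four
     * 32-bit words, accumulates the packed byte differences in x5 (all lanes)
     * and x4 (odd byte lanes), and stops early once the partial SAD held in
     * the upper halfword of x10 exceeds dmin.
     */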
LOOP_SAD0:
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));
    x11 = *((uint32*)(ref + 4));
    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));
    x11 = *((uint32*)(ref + 12));
    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
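    /* Note: fold the split accumulators together.  x5 carries the full packed
     * sums and x4 the separately accumulated odd byte lanes (shifted down by
     * 8); x5 - (x4 << 8) leaves the even-lane sums, adding x4 merges the odd
     * lanes, and adding (x10 << 16) folds the two halfword totals so the
     * complete SAD ends up in the upper 16 bits of x10.
     */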
    x10 = x5 - (x4 << 8);    /* extract low bytes */
    x10 = x10 + x4;          /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
            goto LOOP_SAD0;
        }
    }

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin);

}
#elif defined(__CC_ARM)  /* only work with arm v5 */
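/* Note: this branch targets the ARM RVCT/armcc embedded assembler.
 * sad_4pixel() mirrors the C version above; sad_4pixelN() forms the packed
 * difference with an addition instead of a subtraction, presumably for call
 * sites in sad_mb_offset.h that pass one operand already negated (hence the
 * trailing N).
 */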
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    __asm
    {
        rsbs tmp, tmp, tmp2 ;
        rsbmi tmp, tmp, #0 ;
        add sad, sad, tmp ;
    }

    return sad;
}

__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm
    {
        EOR x7, src2, src1;      /* check odd/even combination */
        SUBS src1, src2, src1;
        EOR x7, x7, src1;
        AND x7, mask, x7, lsr #1;
        ORRCC x7, x7, #0x80000000;
        RSB x7, x7, x7, lsl #8;
        ADD src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
        EOR src1, src1, x7, asr #7;  /* take absolute value of negative byte */
    }

    return src1;
}

__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
    int32 x7;

    __asm
    {
        EOR x7, src2, src1;      /* check odd/even combination */
        ADDS src1, src2, src1;
        EOR x7, x7, src1;        /* only odd bytes need to add carry */
        ANDS x7, mask, x7, rrx;
        RSB x7, x7, x7, lsl #8;
        SUB src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
        EOR src1, src1, x7, asr #7;  /* take absolute value of negative byte */
    }

    return src1;
}

#define sum_accumulate  __asm{ SBC x5, x5, x10;         /* accumulate low bytes */  \
                               BIC x10, x6, x10;        /* x10 & 0xFF00FF00 */      \
                               ADD x4, x4, x10,lsr #8;  /* accumulate high bytes */ \
                               SBC x5, x5, x11;         /* accumulate low bytes */  \
                               BIC x11, x6, x11;        /* x11 & 0xFF00FF00 */      \
                               ADD x4, x4, x11,lsr #8; } /* accumulate high bytes */
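/* Note: sum_accumulate is not referenced in this file; it is presumably
 * consumed by the sad_mb_offsetN() functions instantiated from
 * sad_mb_offset.h below.
 */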
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm
    {
        MOVS x8, ref, lsl #31 ;
        BHI SadMBOffset3;
        BCS SadMBOffset2;
        BMI SadMBOffset1;

        MVN x6, #0xFF00;
    }
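    /* Note: the MOVS/BHI/BCS/BMI sequence above dispatches on the two low
     * address bits of ref: lsl #31 moves bit 0 into N and bit 1 into C, so
     * HI (C set, Z clear) means offset 3, CS means offset 2 and MI means
     * offset 1.
     */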
LOOP_SAD0:
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));
    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    __asm
    {
        /****** process 8 pixels ******/
        LDR x11, [ref, #4];
        LDR x10, [ref], lx ;
        LDR x14, [blk, #4];
        LDR x12, [blk], #16 ;
    }

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    x10 = x5 - (x4 << 8);    /* extract low bytes */
    x10 = x10 + x4;          /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    __asm
    {
        /****************/
        RSBS x11, dmin, x10, lsr #16;
        ADDLSS x8, x8, #0x10000001;
        BLS LOOP_SAD0;
    }

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin, x8);
}
#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */
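/* Note: this branch provides the same routines as the __CC_ARM section above,
 * rewritten as GCC extended inline assembly with register constraints in
 * place of named registers.
 */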
__inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
{
    register int32 out;
    register int32 temp1;
    register int32 ss = sad;
    register int32 tt = tmp;
    register int32 uu = tmp2;

    asm volatile("rsbs %1, %4, %3\n\t"
                 "rsbmi %1, %1, #0\n\t"
                 "add %0, %2, %1"
                 : "=&r"(out),
                   "=&r"(temp1)
                 : "r"(ss),
                   "r"(tt),
                   "r"(uu));
    return out;
}

__inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
{
    register int32 out;
    register int32 temp1;
    register int32 s1 = src1;
    register int32 s2 = src2;
    register int32 mm = mask;

    asm volatile("eor %0, %3, %2\n\t"
                 "subs %1, %3, %2\n\t"
                 "eor %0, %0, %1\n\t"
                 "and %0, %4, %0, lsr #1\n\t"
                 "orrcc %0, %0, #0x80000000\n\t"
                 "rsb %0, %0, %0, lsl #8\n\t"
                 "add %1, %1, %0, asr #7\n\t"
                 "eor %1, %1, %0, asr #7"
                 : "=&r"(out),
                   "=&r"(temp1)
                 : "r"(s1),
                   "r"(s2),
                   "r"(mm));

    return temp1;
}

__inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
{
    register int32 out;
    register int32 temp1;
    register int32 s1 = src1;
    register int32 s2 = src2;
    register int32 mm = mask;

    asm volatile("eor %1, %3, %2\n\t"
                 "adds %0, %3, %2\n\t"
                 "eor %1, %1, %0\n\t"
                 "ands %1, %4, %1,rrx\n\t"
                 "rsb %1, %1, %1, lsl #8\n\t"
                 "sub %0, %0, %1, asr #7\n\t"
                 "eor %0, %0, %1, asr #7"
                 : "=&r"(out),
                   "=&r"(temp1)
                 : "r"(s1),
                   "r"(s2),
                   "r"(mm));

    return (out);
}

#define sum_accumulate  asm volatile("sbc %0, %0, %1\n\t"         \
                                     "bic %1, %4, %1\n\t"         \
                                     "add %2, %2, %1, lsr #8\n\t" \
                                     "sbc %0, %0, %3\n\t"         \
                                     "bic %3, %4, %3\n\t"         \
                                     "add %2, %2, %3, lsr #8"     \
                                     : "+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
                                     : "r"(x6));
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
__inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    x8 = (uint32)ref & 0x3;
    if (x8 == 3)
        goto SadMBOffset3;
    if (x8 == 2)
        goto SadMBOffset2;
    if (x8 == 1)
        goto SadMBOffset1;

    x8 = 16; /* row counter, added to mirror the generic C path above: x8 still
                holds the alignment value (0) here, yet the loop below relies
                on "--x8" to stop after 16 rows */

    asm volatile("mvn %0, #0xFF00": "=r"(x6));

LOOP_SAD0:
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));
    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    asm volatile("ldr %0, [%4, #4]\n\t"
                 "ldr %1, [%4], %6\n\t"
                 "ldr %2, [%5, #4]\n\t"
                 "ldr %3, [%5], #16"
                 : "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
                 : "r"(lx));
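    /* Note: the post-indexed loads above advance ref by one row (lx) and blk
     * by 16 bytes as a side effect, replacing the explicit pointer arithmetic
     * used in the plain C version at the top of this file.
     */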
    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;  /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    x10 = x5 - (x4 << 8);    /* extract low bytes */
    x10 = x10 + x4;          /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
            goto LOOP_SAD0;
        }
    }

    return ((uint32)x10 >> 16);

SadMBOffset3:

    return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

    return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

    return sad_mb_offset1(ref, blk, lx, dmin);
}
#endif // OS

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_