/media/libvpx/vp8/encoder/arm/neon/variance_neon.asm


;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
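
;In plain C, all four routines below compute the following (a reference
;sketch for orientation only; variance_ref and its parameter names are
;illustrative, not the scalar code shipped with vp8):
;
;   unsigned int variance_ref(const unsigned char *src, int src_stride,
;                             const unsigned char *ref, int ref_stride,
;                             int w, int h, unsigned int *sse)
;   {
;       int sum = 0;
;       unsigned int s = 0;
;       int i, j;
;       for (i = 0; i < h; i++, src += src_stride, ref += ref_stride)
;           for (j = 0; j < w; j++)
;           {
;               int d = src[j] - ref[j];          /* per-pixel diff */
;               sum += d;                         /* running sum    */
;               s   += (unsigned int)(d * d);     /* running sse    */
;           }
;       *sse = s;                                 /* report raw sse */
;       return s - (unsigned int)(((long long)sum * sum) / (w * h));
;   }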
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
    ;the results into the elements of the destination vector. The
    ;explanation in the ARM guide is wrong.
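    ;For example, if q11 holds eight s16 diffs d[0..7], then
    ;    vpadal.s16  q8, q11
    ;performs, for each of the four s32 lanes of q8:
    ;    q8[i] += (s32)d[2*i] + (s32)d[2*i+1]
    ;(an illustrative expansion, not the ARM guide's wording).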
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop
    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3
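    ;At this point d0 holds the total signed sum of diffs and d1 holds
    ;the total sse, reduced from the q8 and q9/q10 accumulators above.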
    ;vmov.32        r0, d0[0]                   ;this NEON-to-ARM move costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, asr #8

    ;sum is in [-255x256, 255x256]. sum*sum is 32-bit. The shift to the right
    ;must have sign-bit extension, which is vshr.s. Have to use s32 to make
    ;it right.
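    ;The NEON epilogue below computes variance = sse - sum*sum/256;
    ;the #8 shift is log2(16*16) pixels. The 16x8 and 8x16 routines
    ;shift by #7 (128 pixels) and the 8x8 routine by #6 (64 pixels).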
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #8
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4                     ;4 iterations x 2 rows = 8 rows

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)
|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                     ;2 iterations x 4 rows = 8 rows

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #6
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
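
;Caller sketch (illustrative C, not part of this file): each routine
;returns the variance and writes the raw sse through the last argument.
;
;   unsigned int sse;
;   unsigned int var = vp8_variance16x16_neon(src, src_stride,
;                                             ref, ref_stride, &sse);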
    END