/libavcodec/arm/ac3dsp_neon.S

http://github.com/FFmpeg/FFmpeg · Assembly · 177 lines · 147 code · 11 blank · 19 comment · 0 complexity · 98d464642c23ca6fa0337f8993263521 MD5 · raw file

  1. /*
  2. * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/arm/asm.S"
  /*
   * ff_ac3_max_msb_abs_int16_neon
   *
   * ORs together the absolute values of a run of signed 16-bit samples and
   * returns the largest of the four 16-bit lanes that remain after folding.
   * Because the samples are combined with OR rather than compared, the
   * result has the same highest set bit as the largest absolute value but
   * is not necessarily equal to it.
   *
   * In:   r0 = sample pointer (every load carries a :128 alignment hint)
   *       r1 = sample count, consumed 16 per iteration
   * Out:  r0 = zero-extended 16-bit result
   * Uses: q0-q3, flags
   *
   * The count is only tested after a batch has been processed, so the loop
   * body always executes at least once.
   */
  function ff_ac3_max_msb_abs_int16_neon, export=1
          vmov.i16        q2,  #0                 @ OR accumulator, second half of each batch
          vmov.i16        q0,  #0                 @ OR accumulator, first half of each batch
  1:      vld1.16         {q1},     [r0,:128]!    @ samples 0..7 of this batch
          vabs.s16        q1,  q1
          vld1.16         {q3},     [r0,:128]!    @ samples 8..15 of this batch
          vabs.s16        q3,  q3
          vorr            q0,  q0,  q1
          vorr            q2,  q2,  q3
          subs            r1,  r1,  #16           @ 16 samples handled
          bgt             1b

          vorr            q0,  q0,  q2            @ merge both accumulators
          vorr            d0,  d0,  d1            @ fold 8 lanes down to 4
          vpmax.u16       d0,  d0,  d0            @ pairwise max of the 4 lanes
          vpmax.u16       d0,  d0,  d0            @ lane 0 = max of all 4 lanes
          vmov.u16        r0,  d0[0]
          bx              lr
  endfunc
  /*
   * ff_ac3_exponent_min_neon
   *
   * For each group of 16 bytes starting at r0, takes the unsigned byte-wise
   * minimum of that group and the groups found 256, 512, ... bytes further
   * on (r1 of them), and writes the minimum back over the first group.
   *
   * In:   r0 = byte pointer (loads and stores carry a :128 alignment hint)
   *       r1 = number of following rows to fold in; returns at once if 0
   *       r2 = number of bytes per row to process, consumed 16 at a time
   * Uses: r3, r12, lr (saved on the stack and restored into pc), q0-q1,
   *       flags
   */
  function ff_ac3_exponent_min_neon, export=1
          cmp             r1,  #0
          it              eq
          bxeq            lr                      @ nothing to fold in
          mov             r12, #256               @ byte distance between rows
          push            {lr}                    @ lr doubles as the inner counter
  1:
          vld1.8          {q0},     [r0,:128]     @ running minimum starts as row 0
          add             r3,  r0,  #256          @ r3 -> same column in row 1
          mov             lr,  r1                 @ rows left for this column
  2:      vld1.8          {q1},     [r3,:128], r12  @ load row, step to the next one
          subs            lr,  lr,  #1
          vmin.u8         q0,  q0,  q1
          bgt             2b
          subs            r2,  r2,  #16           @ 16 bytes of the row done
          vst1.8          {q0},     [r0,:128]!    @ store does not touch the flags
          bgt             1b
          pop             {pc}
  endfunc
  /*
   * ff_ac3_lshift_int16_neon
   *
   * Shifts an array of 16-bit values in place by the per-lane count taken
   * from r2, using VSHL with a register shift operand.
   *
   * In:   r0 = pointer to the 16-bit data (:128 alignment hint on accesses)
   *       r1 = element count, consumed 8 per iteration
   *       r2 = shift count, replicated into every 16-bit lane
   * Uses: q0-q1, flags
   *
   * The loop body always executes at least once.
   */
  function ff_ac3_lshift_int16_neon, export=1
          vdup.16         q0,  r2                 @ same shift count in all lanes
  1:      vld1.16         {q1},     [r0,:128]     @ no writeback: stored back below
          vshl.s16        q1,  q1,  q0
          subs            r1,  r1,  #8            @ 8 elements handled
          vst1.16         {q1},     [r0,:128]!    @ store does not touch the flags
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_ac3_rshift_int32_neon
   *
   * Shifts an array of signed 32-bit values in place. The count in r2 is
   * negated first and then handed to VSHL.S32 as a register shift operand,
   * which turns the left-shift instruction into a right shift by r2.
   *
   * In:   r0 = pointer to the 32-bit data (:128 alignment hint on accesses)
   *       r1 = element count, consumed 4 per iteration
   *       r2 = shift count (negated inside the routine)
   * Uses: r2, q0-q1, flags
   *
   * The loop body always executes at least once.
   */
  function ff_ac3_rshift_int32_neon, export=1
          rsb             r2,  r2,  #0            @ r2 = -r2
          vdup.32         q0,  r2                 @ negated count in all lanes
  1:      vld1.32         {q1},     [r0,:128]     @ no writeback: stored back below
          vshl.s32        q1,  q1,  q0
          subs            r1,  r1,  #4            @ 4 elements handled
          vst1.32         {q1},     [r0,:128]!    @ store does not touch the flags
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_float_to_fixed24_neon
   *
   * Converts single-precision floats to signed 32-bit fixed point with 24
   * fractional bits (VCVT.S32.F32 with an fbits operand of 24).
   *
   * In:   r0 = destination pointer (:128 alignment hint on stores)
   *       r1 = source pointer      (:128 alignment hint on loads)
   *       r2 = element count, consumed 16 per iteration
   * Uses: q0-q3, flags
   *
   * The loop body always executes at least once.
   */
  function ff_float_to_fixed24_neon, export=1
  1:      vld1.32         {q0-q1},  [r1,:128]!    @ floats 0..7
          vcvt.s32.f32    q0,  q0,  #24
          vld1.32         {q2-q3},  [r1,:128]!    @ floats 8..15
          vcvt.s32.f32    q1,  q1,  #24
          vcvt.s32.f32    q2,  q2,  #24
          vst1.32         {q0-q1},  [r0,:128]!
          vcvt.s32.f32    q3,  q3,  #24
          subs            r2,  r2,  #16           @ 16 elements handled
          vst1.32         {q2-q3},  [r0,:128]!    @ store does not touch the flags
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_ac3_extract_exponents_neon
   *
   * For every signed 32-bit input value v, writes one output byte holding
   * clz(abs(v)) - 8.
   *
   * In:   r0 = output byte pointer  (:32 alignment hint, 4 bytes per store)
   *       r1 = input 32-bit pointer (:128 alignment hint on loads)
   *       r2 = element count, consumed 4 per iteration
   * Uses: q0, q1, q3, q15, flags
   *
   * The loop body always executes at least once.
   */
  function ff_ac3_extract_exponents_neon, export=1
          vmov.i32        q15, #8                 @ constant subtracted from each clz
  1:
          vld1.32         {q0},     [r1,:128]!
          vabs.s32        q1,  q0
          vclz.i32        q3,  q1
          vsub.i32        q3,  q3,  q15
          vmovn.i32       d6,  q3                 @ 4 x 32 bit -> 4 x 16 bit in d6
          vmovn.i16       d6,  q3                 @ low 4 bytes of d6 = the 4 results
          subs            r2,  r2,  #4            @ 4 elements handled
          vst1.32         {d6[0]},  [r0,:32]!     @ only those 4 bytes are stored
          bgt             1b
          bx              lr
  endfunc
  /*
   * ff_apply_window_int16_neon
   *
   * Multiplies a block of 16-bit samples by a window with VQRDMULH.S16,
   * working inwards from both ends at once. Each iteration reads 8 window
   * coefficients and applies them in order to the next 8 samples from the
   * front and in mirrored order to the next 8 samples from the back.
   *
   * In:   r0 = output pointer
   *       r1 = input pointer
   *       r2 = window pointer, read forwards only
   *       r3 = sample count, consumed 16 per iteration (8 front + 8 back)
   * Uses: r4 and lr (both saved on the stack), r12, q0-q3, flags
   *
   * All vector accesses carry a :128 alignment hint. The loop body always
   * executes at least once.
   */
  function ff_apply_window_int16_neon, export=1
          push            {r4,lr}
          mov             r12, #-16               @ backward stride: 8 samples
          add             r4,  r1,  r3,  lsl #1   @ r4 = input + 2*count ...
          sub             r4,  r4,  #16           @ ... -> last 8 input samples
          add             lr,  r0,  r3,  lsl #1   @ lr = output + 2*count ...
          sub             lr,  lr,  #16           @ ... -> last 8 output samples
  1:
          vld1.16         {q0},     [r1,:128]!    @ 8 samples from the front
          vld1.16         {q2},     [r2,:128]!    @ 8 window coefficients
          vld1.16         {q1},     [r4,:128], r12  @ 8 samples from the back
          vrev64.16       q3,  q2                 @ reverse lanes inside each half
          vqrdmulh.s16    q0,  q0,  q2            @ front: window in order
          vqrdmulh.s16    d2,  d2,  d7            @ back: halves crossed over, so
          vqrdmulh.s16    d3,  d3,  d6            @   the window is fully mirrored
          vst1.16         {q0},     [r0,:128]!
          vst1.16         {q1},     [lr,:128], r12
          subs            r3,  r3,  #16
          bgt             1b
          pop             {r4,pc}
  endfunc
  /*
   * ff_ac3_sum_square_butterfly_int32_neon
   *
   * Walks two arrays a[] and b[] of signed 32-bit values and accumulates
   * four 64-bit sums, which are written to r0 in this order:
   *     sum a*a, sum b*b, sum (a+b)*(a+b), sum (a-b)*(a-b)
   * The sum and difference are formed in 32 bits (VADD.S32 / VSUB.S32)
   * before the widening multiply-accumulate.
   *
   * In:   r0 = output pointer, receives four 64-bit values
   *       r1 = pointer to a[]
   *       r2 = pointer to b[]
   *       r3 = element count, consumed 2 per iteration
   * Uses: q0-q3, d16-d19, flags
   *
   * The loop body always executes at least once.
   */
  function ff_ac3_sum_square_butterfly_int32_neon, export=1
          vmov.i64        q3,  #0                 @ acc: (a-b)^2
          vmov.i64        q2,  #0                 @ acc: (a+b)^2
          vmov.i64        q1,  #0                 @ acc: b^2
          vmov.i64        q0,  #0                 @ acc: a^2
  1:
          vld1.32         {d16},    [r1]!         @ two values of a
          vld1.32         {d17},    [r2]!         @ two values of b
          vadd.s32        d18, d16, d17           @ a + b
          vsub.s32        d19, d16, d17           @ a - b
          vmlal.s32       q0,  d16, d16
          vmlal.s32       q1,  d17, d17
          vmlal.s32       q2,  d18, d18
          vmlal.s32       q3,  d19, d19
          subs            r3,  r3,  #2
          bgt             1b

          @ Collapse each two-lane accumulator into one 64-bit total. The
          @ order matters: every destination is read before it is reused.
          vadd.s64        d0,  d0,  d1
          vadd.s64        d1,  d2,  d3
          vadd.s64        d2,  d4,  d5
          vadd.s64        d3,  d6,  d7
          vst1.64         {d0-d3},  [r0]          @ same registers as q0-q1
          bx              lr
  endfunc
  /*
   * ff_ac3_sum_square_butterfly_float_neon
   *
   * Walks two arrays a[] and b[] of single-precision floats and accumulates
   * four sums, which are written to r0 in this order:
   *     sum a*a, sum b*b, sum (a+b)*(a+b), sum (a-b)*(a-b)
   *
   * In:   r0 = output pointer, receives four floats
   *       r1 = pointer to a[]
   *       r2 = pointer to b[]
   *       r3 = element count, consumed 2 per iteration
   * Uses: q0-q1, d16-d19, flags
   *
   * The loop body always executes at least once.
   */
  function ff_ac3_sum_square_butterfly_float_neon, export=1
          vmov.f32        q1,  #0.0               @ d2: (a+b)^2   d3: (a-b)^2
          vmov.f32        q0,  #0.0               @ d0: a^2       d1: b^2
  1:
          vld1.32         {d16},    [r1]!         @ two values of a
          vld1.32         {d17},    [r2]!         @ two values of b
          vadd.f32        d18, d16, d17           @ a + b
          vsub.f32        d19, d16, d17           @ a - b
          vmla.f32        d0,  d16, d16
          vmla.f32        d1,  d17, d17
          vmla.f32        d2,  d18, d18
          vmla.f32        d3,  d19, d19
          subs            r3,  r3,  #2
          bgt             1b

          @ Pairwise adds collapse each two-lane accumulator to one float.
          @ d1 must be consumed by the first add before the second rewrites it.
          vpadd.f32       d0,  d0,  d1            @ d0 = { sum a^2, sum b^2 }
          vpadd.f32       d1,  d2,  d3            @ d1 = { sum (a+b)^2, sum (a-b)^2 }
          vst1.32         {d0-d1},  [r0]          @ same registers as q0
          bx              lr
  endfunc