
/libavcodec/arm/dsputil_neon.S

http://github.com/FFmpeg/FFmpeg
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/arm/asm.S"

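@ Zero one 8x8 block of 16-bit coefficients.
@   r0 = block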
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

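@ Zero six consecutive 8x8 blocks of 16-bit coefficients.
@   r0 = blocks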
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

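@ put/avg pixel macros, 16 bytes wide.  In the functions expanded from them
@ below, r0 = dst, r1 = src, r2 = line size, r3 = height.  The _x2, _y2 and
@ _xy2 variants interpolate between horizontally, vertically or diagonally
@ adjacent pixels; rnd selects rounding and avg blends with the existing dst.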
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

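@ 8-byte-wide variants of the macros above.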
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

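@ Expand a pixels* macro into an exported function.  The avg/shrn helpers
@ select rounding or truncating forms; NRND emits its argument only in the
@ no-rounding case.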
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

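@ The qpel*_mc00 functions only set the height in r3 and fall through into
@ the put/avg_pixels function expanded immediately after them.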
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

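@ Store an 8x8 block of 16-bit coefficients as bytes, clamped to 0..255.
@   r0 = block, r1 = pixels, r2 = line size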
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

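@ As above, but clamp to the signed 8-bit range and add a bias of 128.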
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

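@ Add an 8x8 block of 16-bit coefficients to 8-bit pixels, with clamping.
@   r0 = block, r1 = pixels, r2 = line size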
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
endfunc

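@ Windowed overlap of two float vectors.
@   r0 = dst, r1 = src0, r2 = src1, r3 = win, [sp] = len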
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

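@ Vorbis inverse channel coupling of the magnitude/angle vectors in r0 and r1,
@ done in place; r2 = block size.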
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

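@ Multiply a float vector by a scalar.  r0 = dst, r1 = src; the scalar and the
@ length arrive in different registers depending on the float ABI (VFP lines
@ assemble for hard-float argument passing, NOVFP lines for soft-float).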
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

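@ Butterfly two float vectors in place:
@   v1[i], v2[i] = v1[i] + v2[i], v1[i] - v2[i]
@   r0 = v1, r1 = v2, r2 = len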
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

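@ Dot product of the float vectors in r0 and r1, length r2.  The result is
@ returned in s0 (hard-float) or r0 (soft-float, NOVFP).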
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

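@ dst[i] = src0[i] * src1[len-1-i]
@   r0 = dst, r1 = src0, r2 = src1, r3 = len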
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

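@ dst[i] = src0[i] * src1[i] + src2[i]
@   r0 = dst, r1 = src0, r2 = src1, r3 = src2, [sp] = len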
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

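@ Clamp a float vector to the range [min, max].  r0 = dst, r1 = src; min, max
@ and the element count arrive per the float ABI (VFP/NOVFP).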
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

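@ Apply a 16-bit window using Q15 saturating rounding multiplies (vqrdmulh).
@ The buffer is processed from both ends at once, reusing the same window
@ samples for the mirrored positions.
@   r0 = output, r1 = input, r2 = window, r3 = length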
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

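@ Clamp an int32 vector to the range [min, max].
@   r0 = dst, r1 = src, r2 = min, r3 = max, [sp] = len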
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc