PageRenderTime 13ms CodeModel.GetById 7ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/libavcodec/arm/dsputil_vfp.S

http://github.com/FFmpeg/FFmpeg
Assembly | 106 lines | 102 code | 4 blank | 0 comment | 8 complexity | cd52f3b2eeaab13ff70bfd269f2fe2e9 MD5 | raw file
  1/*
  2 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
  3 *
  4 * This file is part of FFmpeg.
  5 *
  6 * FFmpeg is free software; you can redistribute it and/or
  7 * modify it under the terms of the GNU Lesser General Public
  8 * License as published by the Free Software Foundation; either
  9 * version 2.1 of the License, or (at your option) any later version.
 10 *
 11 * FFmpeg is distributed in the hope that it will be useful,
 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 * Lesser General Public License for more details.
 15 *
 16 * You should have received a copy of the GNU Lesser General Public
 17 * License along with FFmpeg; if not, write to the Free Software
 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 19 */
 20
 21#include "config.h"
 22#include "libavutil/arm/asm.S"
 23
 24/*
 25 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
 26 * throughput for almost all the instructions (except for double precision
 27 * arithmetic), but rather high latency. Latency is 4 cycles for loads and 8 cycles
 28 * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
 29 * important for performance. One more interesting feature is that VFP has
 30 * independent load/store and arithmetic pipelines, so it is possible to make
 31 * them work simultaneously and get more than 1 operation per cycle. The load/store
 32 * pipeline can process 2 single precision floating point values per cycle and
 33 * supports bulk loads and stores for large sets of registers. Arithmetic operations
 34 * can be done on vectors, which makes it possible to keep the arithmetic pipeline
 35 * busy while the processor may issue and execute other instructions. Detailed
 36 * optimization manuals can be found at http://www.arm.com
 37 */
 38
 39/**
 40 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 41 * Assumes that len is a positive number and is a multiple of 8.
    *
    * Computes dst[i] = src0[i] * src1[len - 1 - i] for every i in [0, len).
    *
    * Register roles (arguments arrive in r0-r3 in prototype order, per the
    * ARM procedure call standard):
    *   r0      dst write pointer, advanced by each vstmia
    *   r1      src0 read pointer, ascending (vldmia)
    *   r2      src1 read pointer, set to one past the end, descending (vldmdb)
    *   r3      count of elements not yet stored; decremented by 16 per pass
    *   s0-s7   src1 values of the first group of 8 in a pass
    *   s8-s15  src0 values of that group, overwritten in place by the products
    *   s16-s23 src1 values of the second group of 8 in a pass
    *   s24-s31 src0 values of that group, overwritten in place by the products
    *
    * The loop is software pipelined: each pass finishes and stores the group
    * that was loaded earlier, handles a second group of 8 when one exists
    * (condition ge after the subs), and preloads the first group of the next
    * pass when more data follows (condition gt).
    *
    * The it/itt/ittt/itttt lines are IT-block prefixes for the conditional
    * instructions that follow them, as required by the Thumb-2 encoding.
 42 */
 43@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
 44@                                 const float *src1, int len)
 45function ff_vector_fmul_reverse_vfp, export=1
           /* d8-d15 alias s16-s31, which the loop uses for the second group of 8.
            * They are saved here and restored by the vpop before returning. */
 46        vpush           {d8-d15}
           /* r2 = src1 + len * 4 bytes, i.e. one past the last float of src1. */
 47        add             r2,  r2,  r3, lsl #2
           /* Preload the first group. The descending loads put src1[len-1] in s3
            * and src1[len-8] in s4; the ascending loads put src0[0..7] in s8-s15. */
 48        vldmdb          r2!, {s0-s3}
 49        vldmia          r1!, {s8-s11}
 50        vldmdb          r2!, {s4-s7}
 51        vldmia          r1!, {s12-s15}
           /* First four products of the group. Pairing s3,s2,s1,s0 with
            * s8,s9,s10,s11 is what reverses src1. */
 52        vmul.f32        s8,  s3,  s8
 53        vmul.f32        s9,  s2,  s9
 54        vmul.f32        s10, s1,  s10
 55        vmul.f32        s11, s0,  s11
 561:
           /* r3 -= 16 and set the flags used by every conditional instruction in
            * this pass. ge: a second group of 8 exists in this pass.
            * gt: more elements follow this pass, so the next group is preloaded. */
 57        subs            r3,  r3,  #16
           /* If ge, load the second group (src1 into s16-s23, src0 into s24-s31),
            * interleaved with the last four products of the first group. */
 58        it              ge
 59        vldmdbge        r2!, {s16-s19}
 60        vmul.f32        s12, s7,  s12
 61        it              ge
 62        vldmiage        r1!, {s24-s27}
 63        vmul.f32        s13, s6,  s13
 64        it              ge
 65        vldmdbge        r2!, {s20-s23}
 66        vmul.f32        s14, s5,  s14
 67        it              ge
 68        vldmiage        r1!, {s28-s31}
 69        vmul.f32        s15, s4,  s15
           /* If ge, form the second group's products in s24-s31. */
 70        it              ge
 71        vmulge.f32      s24, s19, s24
           /* If gt, start preloading the next pass. s0-s3 are free because the
            * multiplies that read them have already been issued. */
 72        it              gt
 73        vldmdbgt        r2!, {s0-s3}
 74        it              ge
 75        vmulge.f32      s25, s18, s25
           /* Store the first six products of the first group. This store comes
            * before the reload of s8-s11 below. s14/s15 are stored separately
            * further down, presumably to give their multiplies time to complete. */
 76        vstmia          r0!, {s8-s13}
 77        it              ge
 78        vmulge.f32      s26, s17, s26
 79        it              gt
 80        vldmiagt        r1!, {s8-s11}
 81        itt             ge
 82        vmulge.f32      s27, s16, s27
 83        vmulge.f32      s28, s23, s28
           /* s4-s7 were last read by the s12-s15 multiplies above, so they can be
            * reloaded for the next pass. */
 84        it              gt
 85        vldmdbgt        r2!, {s4-s7}
 86        it              ge
 87        vmulge.f32      s29, s22, s29
           /* Store the remaining two products of the first group. s12-s15 are
            * reloaded only after this store. */
 88        vstmia          r0!, {s14-s15}
           /* Finish the second group's products and begin the first four products
            * of the next pass (s8-s11). Those four multiplies are conditional on
            * ge while the loads feeding them are conditional on gt: when exactly
            * 16 elements were left (flags eq) they run on stale register contents,
            * but the loop then exits and their results are never stored. */
 89        ittt            ge
 90        vmulge.f32      s30, s21, s30
 91        vmulge.f32      s31, s20, s31
 92        vmulge.f32      s8,  s3,  s8
 93        it              gt
 94        vldmiagt        r1!, {s12-s15}
 95        itttt           ge
 96        vmulge.f32      s9,  s2,  s9
 97        vmulge.f32      s10, s1,  s10
 98        vstmiage        r0!, {s24-s27}
 99        vmulge.f32      s11, s0,  s11
           /* If ge, store the second half of the second group's products. */
100        it              ge
101        vstmiage        r0!, {s28-s31}
           /* Loop while elements remain. The flags are still those set by the
            * subs at the top of the pass. */
102        bgt             1b
103
           /* Restore the saved d8-d15 and return to the caller. */
104        vpop            {d8-d15}
105        bx              lr
106endfunc