
/media/libvpx/vp8/encoder/arm/neon/variance_neon.asm

http://github.com/zpao/v8monkey
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

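;==================================
; All four routines below compute the variance of a w x h block:
;
;     variance = sse - (sum * sum) / (w * h)
;
; where sum and sse accumulate the per-pixel differences src - ref and
; their squares. w*h is 256, 128 or 64, so the division becomes an
; arithmetic shift right by #8, #7 or #6 in each epilogue. Each pass of
; the inner loops handles two rows (four rows in the 8x8 case), hence
; the loop counters of 8, 4, 8 and 2.
;
; A minimal C sketch of the same computation (hypothetical helper for
; illustration only, not part of this file):
;
;   unsigned int variance_wxh(const unsigned char *src, int src_stride,
;                             const unsigned char *ref, int ref_stride,
;                             int w, int h, unsigned int *sse)
;   {
;       int r, c, sum = 0;
;       unsigned int sse_acc = 0;
;       for (r = 0; r < h; r++, src += src_stride, ref += ref_stride) {
;           for (c = 0; c < w; c++) {
;               int diff = src[c] - ref[c];             /* per-pixel difference */
;               sum += diff;
;               sse_acc += (unsigned int)(diff * diff);
;           }
;       }
;       *sse = sse_acc;
;       /* 64-bit product sidesteps signed overflow; w*h is a power of two */
;       return sse_acc - (unsigned int)(((long long)sum * sum) / (w * h));
;   }
;==================================
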
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
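    ;Per iteration q11..q14 each hold eight widened s16 diffs: vpadal.s16
    ;folds adjacent pairs into the four s32 sum lanes of q8, while vmlal.s16
    ;squares the low/high halves (d22/d23, ...) and accumulates them into
    ;the s32 sse lanes of q9/q10.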
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, asr #8

    ;sum is in [-255x256, 255x256], so sum*sum fits in 32 bits. The right
    ;shift must sign-extend, which is vshr.s; s32 has to be used to make it right.
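    ;Worked bound: |sum| <= 255x256 = 65280 and 65280^2 = 4261478400 < 2^32,
    ;so sum*sum lands in the low 32 bits of d10, the bottom half of the
    ;64-bit vmull.s32 result; the #8 shift divides by 16x16 = 256.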
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #8
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
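    ;#7: divide sum*sum by 16x8 = 128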
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
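    ;#7: divide sum*sum by 8x16 = 128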
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
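    ;#6: divide sum*sum by 8x8 = 64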
    vshr.s32        d10, d10, #6
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

    END