
/media/libvpx/vp8/encoder/arm/neon/fastfdct8x4_neon.asm

http://github.com/zpao/v8monkey
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_fast_fdct8x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input is *src_diff, which is calculated as
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions),
;where *src_ptr and *pred_ptr are both unsigned char.
;Therefore, *src_diff is in the range [-255, 255].
;CAUTION:
;The input values of the 25th block are set in the vp8_build_dcblock function and can fall
;outside [-255, 255]. However, the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th
;block, never vp8_fast_fdct4x4_c, so assuming *input is in [-255, 255] is valid in
;vp8_fast_fdct4x4_c but not in vp8_short_fdct4x4_c.
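;
;(annotation) Reconstructed per-element sketch of the first pass below, pieced together
;from the inline comments; the names a1/b1/c1/d1, temp1/temp2 and x_c1/x_c2/x_c3 follow
;those comments and are not guaranteed to match the upstream C reference exactly:
;    a1 = (ip[0] + ip[3]) << 1;          c1 = (ip[1] - ip[2]) << 1;
;    b1 = (ip[1] + ip[2]) << 1;          d1 = (ip[0] - ip[3]) << 1;
;    temp1 = a1 + b1;                    temp2 = a1 - b1;
;    op[0] = ((temp1 * x_c2) >> 16) + temp1;
;    op[2] = ((temp2 * x_c2) >> 16) + temp2;
;    temp1 = (c1 * x_c3) >> 16;          temp2 = ((d1 * x_c1) >> 16) + d1;
;    op[1] = temp1 + temp2;
;    temp1 = (d1 * x_c3) >> 16;          temp2 = ((c1 * x_c1) >> 16) + c1;
;    op[3] = temp1 - temp2;
;The second pass repeats the same butterfly without the << 1 pre-scaling, and the final
;shift sequence halves the results. With *input in [-255, 255], a1..d1 stay within int16,
;which is what makes the 16-bit NEON arithmetic below safe.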

|vp8_fast_fdct8x4_neon| PROC
    vld1.16         {q1}, [r0], r2              ;load input
    ldr             r12, _ffdct8_coeff_
    vld1.16         {q2}, [r0], r2
    vld1.16         {q3}, [r0], r2
    vld1.16         {d0}, [r12]
    vld1.16         {q4}, [r0], r2

    ;First for-loop
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

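    ;(annotation) Per the transpose comments above, each d register now holds one input
    ;element (ip[n]) across all four rows of a 4x4 block: d2/d4/d6/d8 for the left block
    ;and d3/d5/d7/d9 for the right one, so every instruction below transforms four rows
    ;at a time and the two blocks are carried side by side in separate registers.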
    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
    vadd.s16        d22, d3, d9
    vadd.s16        d23, d5, d7
    vsub.s16        d24, d5, d7
    vsub.s16        d25, d3, d9

    vshl.i16        q5, q5, #1              ; a1, b1
    vshl.i16        q6, q6, #1              ; c1, d1
    vshl.i16        q1, q11, #1
    vshl.i16        q2, q12, #1

    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    vadd.s16        d24, d2, d3
    vsub.s16        d25, d2, d3

    vqdmulh.s16     q8, q7, d0[1]
    vqdmulh.s16     q13, q12, d0[1]
    vqdmulh.s16     q10, q6, d0[0]
    vqdmulh.s16     q15, q2, d0[0]
    vqdmulh.s16     q9, q6, d0[2]
    vqdmulh.s16     q14, q2, d0[2]
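    ;(annotation) vqdmulh.s16 returns the saturated high half of 2*a*b, i.e. (2*a*b)>>16;
    ;the vshr #1 instructions below halve that again, so each product matches the
    ;(x * x_c?)>>16 terms named in the comments (up to rounding on negative values).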

    vshr.s16        q8, q8, #1
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2

    ;Second for-loop
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
    vadd.s16        d2, d3, d9
    vadd.s16        d4, d5, d7
    vsub.s16        d24, d5, d7
    vsub.s16        d25, d3, d9

    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    vadd.s16        d22, d2, d4
    vsub.s16        d23, d2, d4

    vqdmulh.s16     q8, q7, d0[1]
    vqdmulh.s16     q13, q11, d0[1]
    vqdmulh.s16     q10, q6, d0[0]
    vqdmulh.s16     q15, q12, d0[0]
    vqdmulh.s16     q9, q6, d0[2]
    vqdmulh.s16     q14, q12, d0[2]

    vshr.s16        q8, q8, #1
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2

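    ;(annotation) The vclt/vsub/vshr sequence below divides every coefficient by 2,
    ;rounding toward zero: vclt sets negative lanes to all ones (-1), vsub then adds 1
    ;back to those lanes, and the arithmetic shift right by 1 completes op[i] /= 2.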
    vclt.s16        q5, q1, #0
    vclt.s16        q6, q2, #0
    vclt.s16        q7, q3, #0
    vclt.s16        q8, q4, #0

    vsub.s16        q1, q1, q5
    vsub.s16        q2, q2, q6
    vsub.s16        q3, q3, q7
    vsub.s16        q4, q4, q8

    vshr.s16        q1, q1, #1
    vshr.s16        q2, q2, #1
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1

    vst1.16         {q1, q2}, [r1]!
    vst1.16         {q3, q4}, [r1]

    bx              lr

    ENDP

;-----------------

_ffdct8_coeff_
    DCD     ffdct8_coeff
ffdct8_coeff
; 60547 =  0xEC83
; 46341 =  0xB505
; 25080 =  0x61F8
    DCD     0xB505EC83, 0x000061F8
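;(annotation) Loaded into d0 as four s16 lanes: d0[0]=60547 (x_c1), d0[1]=46341 (x_c2),
;d0[2]=25080 (x_c3), d0[3]=0. The values look like fixed-point constants, roughly
;cos(pi/8), sqrt(1/2) and sin(pi/8) scaled by 65536, but that reading is inferred from
;the numbers rather than stated anywhere in this file.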

    END