PageRenderTime 26ms CodeModel.GetById 13ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 1ms

/media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm

http://github.com/zpao/v8monkey
Assembly | 183 lines | 128 code | 34 blank | 21 comment | 0 complexity | ee06f1b3f6424b4555141d2b9a9f32c9 MD5 | raw file
  1;
  2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3;
  4;  Use of this source code is governed by a BSD-style license
  5;  that can be found in the LICENSE file in the root of the source
  6;  tree. An additional intellectual property rights grant can be found
  7;  in the file PATENTS.  All contributing project authors may
  8;  be found in the AUTHORS file in the root of the source tree.
  9;
 10
 11
 12    EXPORT  |vp8_bilinear_predict8x8_neon|   ; make the entry point visible to the linker
 13    ARM                                      ; assemble what follows as ARM (A32) instructions
 14    REQUIRE8                                 ; this code requires an 8-byte aligned stack
 15    PRESERVE8                                ; this code keeps the stack 8-byte aligned
 16
 17    AREA ||.text||, CODE, READONLY, ALIGN=2  ; read-only code section, 2^2 = 4-byte aligned
 ;-----------------------------------------------------------------------
 ; vp8_bilinear_predict8x8_neon
 ;
 ; Two-pass bilinear prediction of an 8x8 block.
 ;   Pass 1 (horizontal): a 2-tap filter chosen by xoffset is applied to
 ;                        9 source rows, giving 9 rows of 8 pixels.
 ;   Pass 2 (vertical):   a 2-tap filter chosen by yoffset combines each
 ;                        pair of adjacent rows into one of 8 output rows.
 ; A pass is skipped when its offset is 0 (rows are then copied as-is).
 ;
 ; Arguments:
 ;   r0        unsigned char *src_ptr
 ;   r1        int            src_pixels_per_line
 ;   r2        int            xoffset    (scaled by 8 to index bifilter8_coeff)
 ;   r3        int            yoffset    (scaled by 8 to index bifilter8_coeff)
 ;   [sp, #0]  unsigned char *dst_ptr    (5th argument, loaded into r4)
 ;   [sp, #4]  int            dst_pitch  (6th argument, loaded into lr)
 ;
 ; Saved and restored here: r4, lr, d8-d15 (q4-q7).
 ; Clobbered:               r0-r3, r12, q0-q3, q8-q15, condition flags.
 ;
 ; Note: pass 1 loads 16 bytes per source row although only the first
 ;       9 bytes contribute to the result.
 ;-----------------------------------------------------------------------

|vp8_bilinear_predict8x8_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;q4-q7 are callee-saved (AAPCS) and are written below

    adr             r12, bifilter8_coeff
    ;Stack arguments now sit above 8 bytes (r4, lr) + 64 bytes (d8-d15).
    ldr             r4, [sp, #72]           ;r4 = dst_ptr
    ldr             lr, [sp, #76]           ;lr = dst_pitch

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;r2 = &bifilter8_coeff[xoffset] (8 bytes per entry)

    vld1.u8         {q1}, [r0], r1          ;load src rows 0-3 (16 bytes each)
    vld1.u32        {d31}, [r2]             ;load first_pass filter (two 32-bit taps)
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;d0 = tap 0 replicated to every byte lane
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]              ;d1 = tap 1 (low byte of the second word)
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1] (row shifted left by one pixel)
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src rows 4-7, interleaved with narrowing
    vqrshrn.u16     d22, q6, #7             ;shift/round/saturate to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16     d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16     d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16     d25, q9, #7

    ;first_pass filtering on the remaining 5 rows
    vld1.u8         {q5}, [r0], r1          ;src row 8 (extra row needed by the second pass)

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16     d26, q6, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d27, q7, #7
    vqrshrn.u16     d28, q8, #7
    vqrshrn.u16     d29, q9, #7
    vqrshrn.u16     d30, q10, #7

;Second pass: 8x8
;On entry d22-d30 hold the 9 intermediate rows, 8 pixels each.
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;r3 = &bifilter8_coeff[yoffset]
    add             r0, r4, lr              ;r0 = address of dst row 1

    vld1.u32        {d31}, [r3]             ;load second_pass filter
    add             r1, r0, lr              ;r1 = address of dst row 2

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1]), i.e. the row below
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2}, [r4]              ;store dst row 0
    vst1.u8         {d3}, [r0]              ;store dst row 1
    vst1.u8         {d4}, [r1], lr          ;store dst rows 2-7, stepping by dst_pitch
    vst1.u8         {d5}, [r1], lr
    vst1.u8         {d6}, [r1], lr
    vst1.u8         {d7}, [r1], lr
    vst1.u8         {d8}, [r1], lr
    vst1.u8         {d9}, [r1], lr

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

;--------------------
;xoffset == 0: no horizontal filtering, take 9 source rows unchanged.
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;---------------------
;yoffset == 0: no vertical filtering, write the first 8 intermediate rows.
skip_secondpass_filter
    vst1.u8         {d22}, [r4], lr         ;store result
    vst1.u8         {d23}, [r4], lr
    vst1.u8         {d24}, [r4], lr
    vst1.u8         {d25}, [r4], lr
    vst1.u8         {d26}, [r4], lr
    vst1.u8         {d27}, [r4], lr
    vst1.u8         {d28}, [r4], lr
    vst1.u8         {d29}, [r4], lr

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

    ENDP
177
178;-----------------
179
bifilter8_coeff
    ; Eight 2-tap filters, one {tap0, tap1} pair of 32-bit words per row
    ; (8 bytes per entry). Every pair sums to 128.
    DCD     128,   0
    DCD     112,  16
    DCD      96,  32
    DCD      80,  48
    DCD      64,  64
    DCD      48,  80
    DCD      32,  96
    DCD      16, 112

    END