PageRenderTime 33ms CodeModel.GetById 14ms app.highlight 15ms RepoModel.GetById 2ms app.codeStats 0ms

/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm

http://github.com/zpao/v8monkey
Assembly | 237 lines | 148 code | 63 blank | 26 comment | 0 complexity | cf8ed62b97b6e8e7c283dd3d7088202e MD5 | raw file
  1;
  2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3;
  4;  Use of this source code is governed by a BSD-style license
  5;  that can be found in the LICENSE file in the root of the source
  6;  tree. An additional intellectual property rights grant can be found
  7;  in the file PATENTS.  All contributing project authors may
  8;  be found in the AUTHORS file in the root of the source tree.
  9;
 10
 11
 12    EXPORT  |vp8_filter_block2d_bil_first_pass_armv6|   ; first pass: filters along each source row, writes 16-bit transposed output
 13    EXPORT  |vp8_filter_block2d_bil_second_pass_armv6|  ; second pass: filters the 16-bit transposed data, writes 8-bit output
 14
 15    AREA    |.text|, CODE, READONLY  ; both routines are placed in the read-only code area named .text
 16
 17;-------------------------------------
 18; r0    unsigned char  *src_ptr,      source bytes; the filtered path reads width+1 bytes from each row
 19; r1    unsigned short *dst_ptr,      16-bit output, written transposed
 20; r2    unsigned int    src_pitch,    byte distance between the starts of consecutive source rows
 21; r3    unsigned int    height,       number of source rows to process; must be non-zero (do-while loop)
 22; stack unsigned int    width,        outputs per source row; must be a non-zero multiple of 4 (4 outputs per iteration)
 23; stack const short    *vp8_filter    two 16-bit taps, loaded together as one 32-bit word
 24;-------------------------------------
 25; First pass of the 2-tap filter: each output is (src[x]*tap0 + src[x+1]*tap1 + 0x40) >> 7, saturated to 16 bits. The output is stored transposed in the output array to make second-pass filtering easy.
 26|vp8_filter_block2d_bil_first_pass_armv6| PROC
 27    stmdb   sp!, {r4 - r11, lr}             ; push 9 registers (36 bytes); the stack arguments now sit at sp+36 and sp+40
 28
 29    ldr     r11, [sp, #40]                  ; vp8_filter address
 30    ldr     r4, [sp, #36]                   ; width
 31
 32    mov     r12, r3                         ; outer-loop counter = height (source rows remaining)
 33
 34    add     r7, r2, r4                      ; r7 = src_pitch + width, offset used for the preload hint
 35    pld     [r0, r7]                        ; preload next row
 36
 37    sub     r2, r2, r4                      ; src increment for height loop: src_pitch - width, because the width loop already advances r0 by width
 38
 39    ldr     r5, [r11]                       ; load up filter coefficients: both 16-bit taps packed in one word
 40
 41    mov     r3, r3, lsl #1                  ; height*2
 42    add     r3, r3, #2                      ; plus 2: r3 = height*2 + 2 is the byte stride between transposed outputs, a multiple of 4 bytes when the passed height is odd (original note: height is actually (height+1))
 43
 44    mov     r11, r1                         ; save dst_ptr for each row: r11 = where the current source row's outputs start
 45
 46    cmp     r5, #128                        ; if filter coef = 128 (packed word == 128, i.e. low tap 128 and high tap 0), then skip the filter
 47    beq     bil_null_1st_filter             ; take the plain copy path
 48
 49|bil_height_loop_1st_v6|
 50    ldrb    r6, [r0]                        ; load source data: r6, r7, r8 = src[0], src[1], src[2]
 51    ldrb    r7, [r0, #1]
 52    ldrb    r8, [r0, #2]
 53    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter = width / 4
 54
 55|bil_width_loop_1st_v6|
 56    ldrb    r9, [r0, #3]                    ; r9 = src[3]
 57    ldrb    r10, [r0, #4]                   ; r10 = src[4], needed as the second input of the 4th output
 58
 59    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]  (high halfword | low halfword)
 60    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
 61
 62    smuad   r6, r6, r5                      ; apply the filter: r6 = lo(r6)*lo(r5) + hi(r6)*hi(r5)
 63    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
 64    smuad   r7, r7, r5
 65    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
 66
 67    smuad   r8, r8, r5
 68    smuad   r9, r9, r5
 69
 70    add     r0, r0, #4                      ; advance src past the 4 pixels just filtered
 71    subs    lr, lr, #1                      ; sets the Z flag tested by ldrneb/bne below; nothing in between changes it
 72
 73    add     r6, r6, #0x40                   ; round_shift_and_clamp: add 0x40 (half of 1 << 7) before the shift
 74    add     r7, r7, #0x40
 75    usat    r6, #16, r6, asr #7             ; shift right by 7, then saturate to unsigned 16 bits
 76    usat    r7, #16, r7, asr #7
 77
 78    strh    r6, [r1], r3                    ; result is transposed and stored: r1 post-increments by the stride r3
 79
 80    add     r8, r8, #0x40                   ; round_shift_and_clamp
 81    strh    r7, [r1], r3
 82    add     r9, r9, #0x40
 83    usat    r8, #16, r8, asr #7
 84    usat    r9, #16, r9, asr #7
 85
 86    strh    r8, [r1], r3                    ; result is transposed and stored
 87
 88    ldrneb  r6, [r0]                        ; load source data for the next iteration, only if one remains
 89    strh    r9, [r1], r3
 90
 91    ldrneb  r7, [r0, #1]
 92    ldrneb  r8, [r0, #2]
 93
 94    bne     bil_width_loop_1st_v6
 95
 96    add     r0, r0, r2                      ; move to next input row
 97    subs    r12, r12, #1                    ; one source row done; flags are consumed by the bne at the end of this block
 98
 99    add     r9, r2, r4, lsl #1              ; adding back block width: (src_pitch - width) + 2*width = src_pitch + width
100    pld     [r0, r9]                        ; preload next row
101
102    add     r11, r11, #2                    ; move over to next column (one 16-bit element further in dst)
103    mov     r1, r11
104
105    bne     bil_height_loop_1st_v6
106
107    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return (the saved lr is loaded into pc)
108
109|bil_null_1st_filter|
110|bil_height_loop_null_1st|
111    mov     lr, r4, lsr #2                  ; loop counter = width / 4
112
113|bil_width_loop_null_1st|
114    ldrb    r6, [r0]                        ; load data: 4 source bytes, zero-extended
115    ldrb    r7, [r0, #1]
116    ldrb    r8, [r0, #2]
117    ldrb    r9, [r0, #3]
118
119    strh    r6, [r1], r3                    ; store it unfiltered to the dst buffer as a 16-bit value, same transposed layout as the filtered path
120    add     r0, r0, #4
121    strh    r7, [r1], r3
122    subs    lr, lr, #1                      ; flags for the bne below (the strh instructions do not change them)
123    strh    r8, [r1], r3
124    strh    r9, [r1], r3
125
126    bne     bil_width_loop_null_1st
127
128    subs    r12, r12, #1                    ; one source row done; flags are consumed by the bne below
129    add     r0, r0, r2                      ; move to next input line
130    add     r11, r11, #2                    ; move over to next column
131    mov     r1, r11
132
133    bne     bil_height_loop_null_1st
134
135    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
136
137    ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6|
138
139; Second pass of the 2-tap filter: reads the 16-bit transposed data, computes (src[i]*tap0 + src[i+1]*tap1 + 0x40) >> 7 saturated to 8 bits, and stores bytes dst_pitch apart so the result is transposed back.
140;---------------------------------
141; r0    unsigned short *src_ptr,      16-bit input; each src row is consumed as height*2 + 4 bytes, and the filtered path reads height+1 elements of it
142; r1    unsigned char  *dst_ptr,      8-bit output
143; r2    int             dst_pitch,    byte step between consecutive outputs of one src row
144; r3    unsigned int    height,       outputs per src row; must be a non-zero multiple of 4 (4 outputs per iteration)
145; stack unsigned int    width,        number of src rows (outer loop count); must be non-zero (do-while loop)
146; stack const short    *vp8_filter    two 16-bit taps, loaded together as one 32-bit word
147;---------------------------------
148|vp8_filter_block2d_bil_second_pass_armv6| PROC
149    stmdb   sp!, {r4 - r11, lr}             ; push 9 registers (36 bytes); the stack arguments now sit at sp+36 and sp+40
150
151    ldr     r11, [sp, #40]                  ; vp8_filter address
152    ldr     r4, [sp, #36]                   ; width
153
154    ldr     r5, [r11]                       ; load up filter coefficients: both 16-bit taps packed in one word
155    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
156    mov     r11, r1                         ; r11 = dst position where the current src row's outputs start
157
158    cmp     r5, #128                        ; if filter coef = 128 (packed word == 128, i.e. low tap 128 and high tap 0), then skip the filter
159    beq     bil_null_2nd_filter             ; take the plain copy path
160
161|bil_height_loop_2nd|
162    ldr     r6, [r0]                        ; load the data: r6 = src[1] << 16 | src[0] (element order assumes little-endian word loads)
163    ldr     r8, [r0, #4]                    ; r8 = src[3] << 16 | src[2]
164    ldrh    r10, [r0, #8]                   ; r10 = src[4]
165    mov     lr, r3, lsr #2                  ; loop counter = height / 4
166
167|bil_width_loop_2nd|
168    pkhtb   r7, r6, r8                      ; src[1] | src[2]  (high halfword from r6 | low halfword from r8)
169    pkhtb   r9, r8, r10                     ; src[3] | src[4]
170
171    smuad   r6, r6, r5                      ; apply filter: src[0]*lo(r5) + src[1]*hi(r5)
172    smuad   r8, r8, r5                      ; apply filter: src[2]*lo(r5) + src[3]*hi(r5)
173
174    subs    lr, lr, #1                      ; sets the Z flag tested by ldrne/ldrneh/bne below; nothing in between changes it
175
176    smuadx  r7, r7, r5                      ; apply filter with the halves of r5 exchanged, because r7 holds its pair in reversed order: src[1]*lo(r5) + src[2]*hi(r5)
177    smuadx  r9, r9, r5                      ; apply filter: src[3]*lo(r5) + src[4]*hi(r5)
178
179    add     r0, r0, #8                      ; advance src by 4 halfwords
180
181    add     r6, r6, #0x40                   ; round_shift_and_clamp: add 0x40 (half of 1 << 7) before the shift
182    add     r7, r7, #0x40
183    usat    r6, #8, r6, asr #7              ; shift right by 7, then saturate to unsigned 8 bits
184    usat    r7, #8, r7, asr #7
185    strb    r6, [r1], r2                    ; the result is transposed back and stored: r1 post-increments by dst_pitch
186
187    add     r8, r8, #0x40                   ; round_shift_and_clamp
188    strb    r7, [r1], r2
189    add     r9, r9, #0x40
190    usat    r8, #8, r8, asr #7
191    usat    r9, #8, r9, asr #7
192    strb    r8, [r1], r2                    ; the result is transposed back and stored
193
194    ldrne   r6, [r0]                        ; load data for the next iteration, only if one remains
195    strb    r9, [r1], r2
196    ldrne   r8, [r0, #4]
197    ldrneh  r10, [r0, #8]
198
199    bne     bil_width_loop_2nd
200
201    subs    r12, r12, #1                    ; one src row done; flags are consumed by the bne below
202    add     r0, r0, #4                      ; update src for next row: 4 more bytes on top of the height*2 consumed by the inner loop
203    add     r11, r11, #1                    ; the next src row's outputs start one byte further along in dst
204    mov     r1, r11
205
206    bne     bil_height_loop_2nd
207    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return (the saved lr is loaded into pc)
208
209|bil_null_2nd_filter|
210|bil_height_loop_null_2nd|
211    mov     lr, r3, lsr #2                  ; loop counter = height / 4
212
213|bil_width_loop_null_2nd|
214    ldr     r6, [r0], #4                    ; load data: src[0] and src[1] in one word, then advance src by 4 bytes
215    subs    lr, lr, #1
216    ldr     r8, [r0], #4                    ; src[2] and src[3]
217
218    strb    r6, [r1], r2                    ; store data: low byte of src[0]
219    mov     r7, r6, lsr #16                 ; r7 = src[1]
220    strb    r7, [r1], r2
221    mov     r9, r8, lsr #16                 ; r9 = src[3]
222    strb    r8, [r1], r2                    ; low byte of src[2]
223    strb    r9, [r1], r2
224
225    bne     bil_width_loop_null_2nd
226
227    subs    r12, r12, #1                    ; one src row done; flags are consumed by the bne below
228    add     r0, r0, #4
229    add     r11, r11, #1
230    mov     r1, r11
231
232    bne     bil_height_loop_null_2nd
233
234    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
235    ENDP  ; |vp8_filter_block2d_bil_second_pass_armv6|
236
237    END