/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
http://github.com/zpao/v8monkey · Assembly · 237 lines · 148 code · 63 blank · 26 comment · 0 complexity · cf8ed62b97b6e8e7c283dd3d7088202e MD5 · raw file
- ;
- ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- EXPORT |vp8_filter_block2d_bil_first_pass_armv6| ; first-pass entry point (defined below), exported for external callers
- EXPORT |vp8_filter_block2d_bil_second_pass_armv6| ; second-pass entry point (defined below), exported for external callers
- AREA |.text|, CODE, READONLY ; name this block of code: read-only code area |.text|
- ;-------------------------------------
- ; r0 unsigned char *src_ptr, source pixels, read as bytes
- ; r1 unsigned short *dst_ptr, output buffer, written as halfwords in transposed order
- ; r2 unsigned int src_pitch, bytes between the starts of consecutive source rows
- ; r3 unsigned int height, number of source rows to process (outer loop count)
- ; stack unsigned int width, pixels per row; processed 4 per iteration (width >> 2 iterations, counter decremented before it is tested, so width is assumed to be a non-zero multiple of 4 - TODO confirm with callers)
- ; stack const short *vp8_filter pointer to the two 16-bit filter coefficients, loaded together as one 32-bit word
- ;-------------------------------------
- ; First pass: each output is (weighted sum of two adjacent source bytes in a row + 0x40) >> 7, saturated to unsigned 16 bits. The output is stored transposed in the output array to make second-pass filtering easy. Halfword-order comments such as "src[1] | src[0]" are written as high | low.
- |vp8_filter_block2d_bil_first_pass_armv6| PROC
- stmdb sp!, {r4 - r11, lr} ; save 9 registers (36 bytes), so the stack arguments now sit at [sp, #36] and [sp, #40]
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r4, [sp, #36] ; width
- mov r12, r3 ; outer-loop counter = height
- add r7, r2, r4 ; r7 = src_pitch + width, offset used to preload the next row
- pld [r0, r7] ; cache preload hint only; no register is written
- sub r2, r2, r4 ; src increment for height loop: src_pitch - width, applied after the width loop has walked r0 across the row
- ldr r5, [r11] ; load up filter coefficients: both 16-bit values in one word, as smuad expects
- mov r3, r3, lsl #1 ; height*2
- add r3, r3, #2 ; plus 2: r3 = height*2 + 2 is the byte step between successive stores; original note says this keeps the output buffer aligned since height is actually (height+1) - presumably 4-byte aligned, TODO confirm
- mov r11, r1 ; save dst_ptr for each row
- cmp r5, #128 ; if filter coef = 128, then skip the filter (whole coefficient word == 128, i.e. low halfword 128 and high halfword 0)
- beq bil_null_1st_filter ; pass-through filter: take the plain copy path
- |bil_height_loop_1st_v6|
- ldrb r6, [r0] ; load source data: src[0]
- ldrb r7, [r0, #1] ; src[1]
- ldrb r8, [r0, #2] ; src[2]
- mov lr, r4, lsr #2 ; 4-in-parallel loop counter = width / 4
- |bil_width_loop_1st_v6|
- ldrb r9, [r0, #3] ; src[3]
- ldrb r10, [r0, #4] ; src[4]: one byte past this group of four, needed as the right neighbour of src[3]
- pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
- pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
- smuad r6, r6, r5 ; apply the filter: low*low + high*high of the two packed words
- pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
- smuad r7, r7, r5 ; filter output 1
- pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
- smuad r8, r8, r5 ; filter output 2
- smuad r9, r9, r5 ; filter output 3
- add r0, r0, #4 ; advance source by the 4 pixels just consumed
- subs lr, lr, #1 ; decrement width counter; the flags set here are tested by the ldrneb/bne below
- add r6, r6, #0x40 ; round_shift_and_clamp: add 0x40 (half of 1 << 7) before the shift
- add r7, r7, #0x40
- usat r6, #16, r6, asr #7 ; shift right by 7 and saturate to unsigned 16 bits
- usat r7, #16, r7, asr #7
- strh r6, [r1], r3 ; result is transposed and stored: store halfword, then advance dst by r3 bytes
- add r8, r8, #0x40 ; round_shift_and_clamp
- strh r7, [r1], r3
- add r9, r9, #0x40
- usat r8, #16, r8, asr #7
- usat r9, #16, r9, asr #7
- strh r8, [r1], r3 ; result is transposed and stored
- ldrneb r6, [r0] ; load source data for the next iteration, only when the counter is not yet zero
- strh r9, [r1], r3
- ldrneb r7, [r0, #1] ; next src[1], conditional as above
- ldrneb r8, [r0, #2] ; next src[2], conditional as above
- bne bil_width_loop_1st_v6 ; loop while the width counter is non-zero
- add r0, r0, r2 ; move to next input row
- subs r12, r12, #1 ; decrement row counter; flags are tested by the bne at the end of this block
- add r9, r2, r4, lsl #1 ; adding back block width: (src_pitch - width) + 2*width = src_pitch + width
- pld [r0, r9] ; preload next row
- add r11, r11, #2 ; move over to next column (one halfword) of the transposed output
- mov r1, r11 ; restart dst at the top of that column
- bne bil_height_loop_1st_v6 ; more rows to process
- ldmia sp!, {r4 - r11, pc} ; restore saved registers and return (saved lr is loaded into pc)
- |bil_null_1st_filter| ; copy path taken when the coefficient word is 128: no multiply, no rounding
- |bil_height_loop_null_1st|
- mov lr, r4, lsr #2 ; loop counter = width / 4
- |bil_width_loop_null_1st|
- ldrb r6, [r0] ; load data: four consecutive source bytes
- ldrb r7, [r0, #1]
- ldrb r8, [r0, #2]
- ldrb r9, [r0, #3]
- strh r6, [r1], r3 ; store it to the output buffer as a halfword, same transposed layout as the filtered path
- add r0, r0, #4 ; advance source by 4 pixels
- strh r7, [r1], r3
- subs lr, lr, #1 ; decrement width counter; flags are tested by the bne below
- strh r8, [r1], r3
- strh r9, [r1], r3
- bne bil_width_loop_null_1st ; loop while the width counter is non-zero
- subs r12, r12, #1 ; decrement row counter; flags are tested by the bne below
- add r0, r0, r2 ; move to next input line
- add r11, r11, #2 ; move over to next column
- mov r1, r11 ; restart dst at the top of that column
- bne bil_height_loop_null_1st ; more rows to process
- ldmia sp!, {r4 - r11, pc} ; restore saved registers and return
- ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
- ;---------------------------------
- ; r0 unsigned short *src_ptr, halfword input, read sequentially (presumably the transposed first-pass output - confirm against caller)
- ; r1 unsigned char *dst_ptr, output pixels, written as bytes
- ; r2 int dst_pitch, byte step applied to dst after every store
- ; r3 unsigned int height, values per input row; processed 4 per iteration (height >> 2 iterations, assumed a non-zero multiple of 4 - TODO confirm with callers)
- ; stack unsigned int width, number of input rows to process (outer loop count)
- ; stack const short *vp8_filter pointer to the two 16-bit filter coefficients, loaded together as one 32-bit word
- ;---------------------------------
- |vp8_filter_block2d_bil_second_pass_armv6| PROC ; second pass: filters adjacent halfwords along each input row, rounds, saturates to 8 bits and stores the bytes dst_pitch apart
- stmdb sp!, {r4 - r11, lr} ; save 9 registers (36 bytes), so the stack arguments now sit at [sp, #36] and [sp, #40]
- ldr r11, [sp, #40] ; vp8_filter address
- ldr r4, [sp, #36] ; width
- ldr r5, [r11] ; load up filter coefficients: both 16-bit values in one word
- mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
- mov r11, r1 ; save dst_ptr; it is stepped by 1 byte per outer iteration
- cmp r5, #128 ; if filter coef = 128, then skip the filter (whole coefficient word == 128)
- beq bil_null_2nd_filter ; pass-through filter: take the plain copy path
- |bil_height_loop_2nd|
- ldr r6, [r0] ; load the data: src[1] | src[0] as two halfwords
- ldr r8, [r0, #4] ; src[3] | src[2]
- ldrh r10, [r0, #8] ; src[4], the neighbour needed for output 3
- mov lr, r3, lsr #2 ; loop counter = height / 4
- |bil_width_loop_2nd|
- pkhtb r7, r6, r8 ; src[1] | src[2]: top halfword from r6, bottom halfword from r8
- pkhtb r9, r8, r10 ; src[3] | src[4]: top halfword from r8, bottom halfword from r10
- smuad r6, r6, r5 ; apply filter: output 0 from src[0], src[1]
- smuad r8, r8, r5 ; apply filter: output 2 from src[2], src[3]
- subs lr, lr, #1 ; decrement counter; the flags set here are tested by the ldrne/ldrneh/bne below
- smuadx r7, r7, r5 ; apply filter: output 1; the x form swaps the coefficient halfwords to match the swapped pair packed in r7
- smuadx r9, r9, r5 ; apply filter: output 3, same swapped pairing
- add r0, r0, #8 ; advance source by the 4 halfwords just consumed
- add r6, r6, #0x40 ; round_shift_and_clamp: add 0x40 (half of 1 << 7) before the shift
- add r7, r7, #0x40
- usat r6, #8, r6, asr #7 ; shift right by 7 and saturate to unsigned 8 bits
- usat r7, #8, r7, asr #7
- strb r6, [r1], r2 ; the result is transposed back and stored: store byte, then advance dst by dst_pitch
- add r8, r8, #0x40 ; round_shift_and_clamp
- strb r7, [r1], r2
- add r9, r9, #0x40
- usat r8, #8, r8, asr #7
- usat r9, #8, r9, asr #7
- strb r8, [r1], r2 ; the result is transposed back and stored
- ldrne r6, [r0] ; load data for the next iteration, only when the counter is not yet zero
- strb r9, [r1], r2
- ldrne r8, [r0, #4] ; next src[3] | src[2], conditional as above
- ldrneh r10, [r0, #8] ; next src[4], conditional as above
- bne bil_width_loop_2nd ; loop while the counter is non-zero
- subs r12, r12, #1 ; decrement outer counter; flags are tested by the bne at the end of this block
- add r0, r0, #4 ; update src for next row: input rows are height*2 + 4 bytes apart (NOTE(review): equals the first pass's store step only if that pass ran with height + 1 - confirm against caller)
- add r11, r11, #1 ; next output column (one byte)
- mov r1, r11 ; restart dst at the top of that column
- bne bil_height_loop_2nd ; more input rows to process
- ldmia sp!, {r4 - r11, pc} ; restore saved registers and return (saved lr is loaded into pc)
- |bil_null_2nd_filter| ; copy path taken when the coefficient word is 128: no multiply, rounding or saturation
- |bil_height_loop_null_2nd|
- mov lr, r3, lsr #2 ; loop counter = height / 4
- |bil_width_loop_null_2nd|
- ldr r6, [r0], #4 ; load data: two halfwords, then advance src by 4 bytes
- subs lr, lr, #1 ; decrement counter; flags are tested by the bne below
- ldr r8, [r0], #4 ; next two halfwords, then advance src by 4 bytes
- strb r6, [r1], r2 ; store data: low byte of the first halfword
- mov r7, r6, lsr #16 ; bring the second halfword down to the low bits
- strb r7, [r1], r2 ; low byte of the second halfword
- mov r9, r8, lsr #16 ; bring the fourth halfword down to the low bits
- strb r8, [r1], r2 ; low byte of the third halfword
- strb r9, [r1], r2 ; low byte of the fourth halfword
- bne bil_width_loop_null_2nd ; loop while the counter is non-zero
- subs r12, r12, #1 ; decrement outer counter; flags are tested by the bne below
- add r0, r0, #4 ; skip 4 more bytes to reach the next input row, as in the filtered path
- add r11, r11, #1 ; next output column (one byte)
- mov r1, r11 ; restart dst at the top of that column
- bne bil_height_loop_null_2nd ; more input rows to process
- ldmia sp!, {r4 - r11, pc} ; restore saved registers and return
- ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
- END ; end of the assembly source