/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm

http://github.com/zpao/v8monkey · Assembly · 237 lines · 148 code · 63 blank · 26 comment · 0 complexity · cf8ed62b97b6e8e7c283dd3d7088202e MD5 · raw file

  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
  11. EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
  12. AREA |.text|, CODE, READONLY ; name this block of code
  13. ;-------------------------------------
  14. ; r0 unsigned char *src_ptr,
  15. ; r1 unsigned short *dst_ptr,
  16. ; r2 unsigned int src_pitch,
  17. ; r3 unsigned int height,
  18. ; stack unsigned int width,
  19. ; stack const short *vp8_filter
  20. ;-------------------------------------
  21. ; The output is stored transposed in the output array to make second-pass filtering easy.
  ;
  ; First pass of the bilinear filter. For every source row it combines each
  ; pair of neighbouring bytes:
  ;     out = usat16((src[x] * tap_lo + src[x+1] * tap_hi + 0x40) >> 7)
  ; where tap_lo / tap_hi are the low / high halfwords of the word loaded from
  ; vp8_filter. Results are written as halfwords.
  ;
  ; Output layout: consecutive samples of one source row are stored
  ; (height*2 + 2) bytes apart, and each new source row starts 2 bytes after
  ; the previous one, i.e. the block is written transposed.
  ;
  ; Four samples are produced per inner iteration and the inner counter is
  ; width >> 2; there is no check for width == 0 or width not a multiple of 4
  ; (presumably guaranteed by the caller - TODO confirm).
  ; The filtered path reads src[4] in every inner iteration, i.e. one byte
  ; beyond the four pixels that iteration produces.
  ;
  ; If the packed tap word equals 128 the multiply is skipped and the pixels
  ; are copied (zero-extended to halfwords) by the "null" loops instead.
  ;
  ; Register roles after setup:
  ;   r0  source cursor            r1  destination cursor
  ;   r2  src_pitch - width        r3  destination store stride in bytes
  ;   r4  width                    r5  packed filter taps
  ;   r11 start of the current output column
  ;   r12 source rows remaining    lr  inner (width/4) counter
  22. |vp8_filter_block2d_bil_first_pass_armv6| PROC
  23. stmdb sp!, {r4 - r11, lr} ; push 9 registers (36 bytes); stack arguments now start at [sp, #36]
  24. ldr r11, [sp, #40] ; vp8_filter address (second stack argument)
  25. ldr r4, [sp, #36] ; width (first stack argument)
  26. mov r12, r3 ; outer-loop counter = height (source rows remaining)
  27. add r7, r2, r4 ; r7 = src_pitch + width, offset used to preload the next row
  28. pld [r0, r7]
  29. sub r2, r2, r4 ; r2 = src_pitch - width: added after a row's width loop to reach the next row
  30. ldr r5, [r11] ; load both 16-bit filter coefficients as one packed word
  31. mov r3, r3, lsl #1 ; height*2 (bytes occupied by `height` halfwords)
  32. add r3, r3, #2 ; plus 2: r3 becomes the store stride, (height+1)*2 bytes. Original note: makes the output buffer 4-byte aligned since height is actually (height+1) - presumably the caller passes an odd height, TODO confirm
  33. mov r11, r1 ; save dst_ptr for each row
  34. cmp r5, #128 ; packed taps == 128 (low tap 128, high tap 0): the filter would return the pixel unchanged, so skip it
  35. beq bil_null_1st_filter
  ; ---- filtered path: one outer iteration per source row ----
  36. |bil_height_loop_1st_v6|
  37. ldrb r6, [r0] ; load source data: src[0]
  38. ldrb r7, [r0, #1] ; src[1]
  39. ldrb r8, [r0, #2] ; src[2]
  40. mov lr, r4, lsr #2 ; 4-in-parallel loop counter = width / 4
  41. |bil_width_loop_1st_v6|
  42. ldrb r9, [r0, #3] ; src[3]
  43. ldrb r10, [r0, #4] ; src[4]: right-hand neighbour of the 4th pixel (one byte past this group of four)
  44. pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
  45. pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
  46. smuad r6, r6, r5 ; apply the filter: low*low + high*high of the two packed words
  47. pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
  48. smuad r7, r7, r5
  49. pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
  50. smuad r8, r8, r5
  51. smuad r9, r9, r5
  52. add r0, r0, #4 ; advance source by the four pixels just consumed
  53. subs lr, lr, #1 ; sets the flags tested by the ldrneb/bne below; no instruction in between updates them
  54. add r6, r6, #0x40 ; round_shift_and_clamp: add half of 128 before the >> 7
  55. add r7, r7, #0x40
  56. usat r6, #16, r6, asr #7 ; shift right by 7 and saturate to unsigned 16 bits
  57. usat r7, #16, r7, asr #7
  58. strh r6, [r1], r3 ; result is transposed and stored (post-increment by the stride in r3)
  59. add r8, r8, #0x40 ; round_shift_and_clamp
  60. strh r7, [r1], r3
  61. add r9, r9, #0x40
  62. usat r8, #16, r8, asr #7
  63. usat r9, #16, r9, asr #7
  64. strh r8, [r1], r3 ; result is transposed and stored
  65. ldrneb r6, [r0] ; load source data for the next iteration (only if one remains)
  66. strh r9, [r1], r3
  67. ldrneb r7, [r0, #1]
  68. ldrneb r8, [r0, #2]
  69. bne bil_width_loop_1st_v6
  70. add r0, r0, r2 ; move to next input row
  71. subs r12, r12, #1 ; one row done; flags are consumed by the bne at the end of the loop
  72. add r9, r2, r4, lsl #1 ; adding back block width: (src_pitch - width) + 2*width = src_pitch + width
  73. pld [r0, r9] ; preload next row
  74. add r11, r11, #2 ; move over to next column (one halfword)
  75. mov r1, r11
  76. bne bil_height_loop_1st_v6
  77. ldmia sp!, {r4 - r11, pc} ; restore saved registers and return
  ; ---- null-filter path: copy pixels unfiltered into the same transposed layout ----
  78. |bil_null_1st_filter|
  79. |bil_height_loop_null_1st|
  80. mov lr, r4, lsr #2 ; loop counter = width / 4
  81. |bil_width_loop_null_1st|
  82. ldrb r6, [r0] ; load data
  83. ldrb r7, [r0, #1]
  84. ldrb r8, [r0, #2]
  85. ldrb r9, [r0, #3]
  86. strh r6, [r1], r3 ; store it to the intermediate buffer as a zero-extended halfword
  87. add r0, r0, #4
  88. strh r7, [r1], r3
  89. subs lr, lr, #1 ; flags survive the two stores below and feed the bne
  90. strh r8, [r1], r3
  91. strh r9, [r1], r3
  92. bne bil_width_loop_null_1st
  93. subs r12, r12, #1 ; one row done
  94. add r0, r0, r2 ; move to next input line
  95. add r11, r11, #2 ; move over to next column
  96. mov r1, r11
  97. bne bil_height_loop_null_1st
  98. ldmia sp!, {r4 - r11, pc} ; restore saved registers and return
  99. ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
  100. ;---------------------------------
  101. ; r0 unsigned short *src_ptr,
  102. ; r1 unsigned char *dst_ptr,
  103. ; r2 int dst_pitch,
  104. ; r3 unsigned int height,
  105. ; stack unsigned int width,
  106. ; stack const short *vp8_filter
  107. ;---------------------------------
  ;
  ; Second pass of the bilinear filter. It walks the halfword source buffer
  ; linearly and combines each pair of neighbouring halfwords:
  ;     out = usat8((src[i] * tap_lo + src[i+1] * tap_hi + 0x40) >> 7)
  ; where tap_lo / tap_hi are the low / high halfwords of the word loaded from
  ; vp8_filter. Results are written as bytes.
  ;
  ; Output layout: consecutive results of one outer iteration are stored
  ; dst_pitch bytes apart, and each outer iteration starts one byte after the
  ; previous one, which transposes the data back.
  ;
  ; The outer loop runs `width` times and the inner loop height >> 2 times,
  ; four results per inner iteration; there is no check for zero or for
  ; height not being a multiple of 4 (presumably guaranteed by the caller -
  ; TODO confirm). Despite their names, the *_height_loop_* labels mark the
  ; outer (width-counted) loop and the *_width_loop_* labels the inner
  ; (height-counted) loop.
  ;
  ; The source is read with word loads (ldr).
  ; NOTE(review): this presumably requires src_ptr to be 4-byte aligned - confirm.
  ;
  ; If the packed tap word equals 128 the multiply is skipped and the low
  ; byte of each halfword is copied by the "null" loops instead.
  108. |vp8_filter_block2d_bil_second_pass_armv6| PROC
  109. stmdb sp!, {r4 - r11, lr} ; push 9 registers (36 bytes); stack arguments now start at [sp, #36]
  110. ldr r11, [sp, #40] ; vp8_filter address (second stack argument)
  111. ldr r4, [sp, #36] ; width (first stack argument)
  112. ldr r5, [r11] ; load both 16-bit filter coefficients as one packed word
  113. mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
  114. mov r11, r1 ; r11 = start of the current output column
  115. cmp r5, #128 ; packed taps == 128 (low tap 128, high tap 0): the filter would return the sample unchanged, so skip it
  116. beq bil_null_2nd_filter
  ; ---- filtered path ----
  117. |bil_height_loop_2nd| ; outer loop: executed `width` times
  118. ldr r6, [r0] ; load the data: src[1] | src[0]
  119. ldr r8, [r0, #4] ; src[3] | src[2]
  120. ldrh r10, [r0, #8] ; src[4]
  121. mov lr, r3, lsr #2 ; loop counter = height / 4
  122. |bil_width_loop_2nd| ; inner loop: four results per iteration
  123. pkhtb r7, r6, r8 ; src[1] | src[2]
  124. pkhtb r9, r8, r10 ; src[3] | src[4]
  125. smuad r6, r6, r5 ; apply filter to the src[0],src[1] pair
  126. smuad r8, r8, r5 ; apply filter to the src[2],src[3] pair
  127. subs lr, lr, #1 ; sets the flags tested by the ldrne/ldrneh/bne below; no instruction in between updates them
  128. smuadx r7, r7, r5 ; apply filter; the X form swaps the halves of r5 because r7 is packed in reversed order
  129. smuadx r9, r9, r5 ; apply filter (reversed packing, as above)
  130. add r0, r0, #8 ; advance source by the four halfwords just consumed
  131. add r6, r6, #0x40 ; round_shift_and_clamp: add half of 128 before the >> 7
  132. add r7, r7, #0x40
  133. usat r6, #8, r6, asr #7 ; shift right by 7 and saturate to unsigned 8 bits
  134. usat r7, #8, r7, asr #7
  135. strb r6, [r1], r2 ; the result is transposed back and stored (post-increment by dst_pitch)
  136. add r8, r8, #0x40 ; round_shift_and_clamp
  137. strb r7, [r1], r2
  138. add r9, r9, #0x40
  139. usat r8, #8, r8, asr #7
  140. usat r9, #8, r9, asr #7
  141. strb r8, [r1], r2 ; the result is transposed back and stored
  142. ldrne r6, [r0] ; load data for the next iteration (only if one remains)
  143. strb r9, [r1], r2
  144. ldrne r8, [r0, #4]
  145. ldrneh r10, [r0, #8]
  146. bne bil_width_loop_2nd
  147. subs r12, r12, #1 ; one outer iteration done; flags are consumed by the bne below
  148. add r0, r0, #4 ; update src for next row: skip 4 more bytes (presumably the extra halfword plus padding written by the first pass - TODO confirm)
  149. add r11, r11, #1 ; next output column (one byte)
  150. mov r1, r11
  151. bne bil_height_loop_2nd
  152. ldmia sp!, {r4 - r11, pc} ; restore saved registers and return
  ; ---- null-filter path: copy the low byte of each halfword, transposing back ----
  153. |bil_null_2nd_filter|
  154. |bil_height_loop_null_2nd| ; outer loop: executed `width` times
  155. mov lr, r3, lsr #2 ; loop counter = height / 4
  156. |bil_width_loop_null_2nd|
  157. ldr r6, [r0], #4 ; load data: src[1] | src[0], then advance 4 bytes
  158. subs lr, lr, #1 ; flags survive the loads, moves and stores below and feed the bne
  159. ldr r8, [r0], #4 ; src[3] | src[2], then advance 4 bytes
  160. strb r6, [r1], r2 ; store data: low byte of src[0]
  161. mov r7, r6, lsr #16 ; r7 = src[1]
  162. strb r7, [r1], r2
  163. mov r9, r8, lsr #16 ; r9 = src[3]
  164. strb r8, [r1], r2 ; low byte of src[2]
  165. strb r9, [r1], r2
  166. bne bil_width_loop_null_2nd
  167. subs r12, r12, #1 ; one outer iteration done
  168. add r0, r0, #4 ; skip 4 more bytes to the start of the next source row
  169. add r11, r11, #1 ; next output column (one byte)
  170. mov r1, r11
  171. bne bil_height_loop_null_2nd
  172. ldmia sp!, {r4 - r11, pc} ; restore saved registers and return
  173. ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
  174. END