; /media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
;
; http://github.com/zpao/v8monkey · Assembly · 207 lines · 139 code · 28 blank · 40 comment · 0 complexity · 79da8281245c2ca513662429c9d44676 MD5 · raw file

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  %include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; Per pixel i of the block (16 pixels are handled per loop pass):
;   diff      = frame1[i] - frame2[i]
;   modifier  = (3 * diff^2 + (0x8000 >> (16 - strength))) >> strength
;   modifier  = (16 - modifier, floored at 0) * filter_weight
;   count[i]       += modifier
;   accumulator[i] += modifier * frame2[i]
;
; Layout, as used by the code below:
;   frame1       rows are `stride` bytes apart.
;   frame2       is read as one packed run of 8*8 or 16*16 bytes.
;   count        advances 16 shorts per pass.
;   accumulator  advances 16 dwords per pass.
;   frame2, count and accumulator (and frame1 on the 16-wide path) are
;   accessed with movdqa, so they must be 16-byte aligned.
;   block_size is expected to be 8 or 16.
;
; Register roles inside the loop:
;   rsi  frame1 row cursor          rbp  stride
;   rdx  frame2 cursor              rcx  end of frame2 (loop bound)
;   rdi  accumulator cursor         rax  count cursor
;   xmm7 zero, used to widen bytes -> words -> dwords
;
; Arguments are reached through arg(n); prolog/epilog glue comes from the
; macros in x86_abi_support.asm.
;-----------------------------------------------------------------------
global sym(vp8_temporal_filter_apply_sse2)
sym(vp8_temporal_filter_apply_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 8
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; NOTE(review): presumably ALIGN_STACK leaves the pre-alignment rsp on
        ; the stack, which is what the `pop rsp` in the epilog restores --
        ; confirm against x86_abi_support.asm.
        ALIGN_STACK 16, rax

        ; 16-byte aligned scratch slots, addressed relative to rsp
        %define block_size    0
        %define strength      16
        %define filter_weight 32
        %define rounding_bit  48
        %define rbp_backup    64
        %define stack_size    80
        sub         rsp, stack_size
        mov         [rsp + rbp_backup], rbp     ; rbp is reused for stride below
        ; end prolog

        mov         rdx, arg(3)
        mov         [rsp + block_size], rdx

        ; psrlw with a memory operand reads a full 16 bytes for its count,
        ; so the shift amount is parked in its own aligned slot.
        movd        xmm6, arg(4)
        movdqa      [rsp + strength], xmm6

        ; Rounding term, computed once: 0x8000 >> (16 - strength)
        ; NOTE(review): arg(4) is subtracted at full register width although
        ; strength is declared int -- assumes the upper bits are clean; confirm.
        mov         rdx, 16
        sub         rdx, arg(4)                 ; 16 - strength
        movd        xmm4, rdx                   ; psrlw needs its count in an xmm reg
        movdqa      xmm5, [GLOBAL(_const_top_bit)]
        psrlw       xmm5, xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi, arg(0)                 ; src / frame1
        mov         rdx, arg(2)                 ; predictor / frame2
        mov         rdi, arg(6)                 ; accumulator
        mov         rax, arg(7)                 ; count

        ; Broadcast filter_weight to all eight words and keep it for the loop
        movd        xmm0, arg(5)
        pshuflw     xmm0, xmm0, 0
        punpcklwd   xmm0, xmm0
        movdqa      [rsp + filter_weight], xmm0

        mov         rbp, arg(1)                 ; stride (arg() is unusable from here on)
        pxor        xmm7, xmm7                  ; zero for extraction

        ; rcx = one past the last predictor byte; assume 16x16, shrink for 8x8
        lea         rcx, [rdx + 16*16*1]
        cmp         dword ptr [rsp + block_size], 8
        jne         .load_16
        lea         rcx, [rdx + 8*8*1]

.load_8:
        ; Two 8-pixel source rows make up one 16-pixel batch
        movq        xmm0, [rsi]                 ; first row
        lea         rsi, [rsi + rbp]            ; += stride
        punpcklbw   xmm0, xmm7                  ; src[ 0- 7]
        movq        xmm1, [rsi]                 ; second row
        lea         rsi, [rsi + rbp]            ; += stride
        punpcklbw   xmm1, xmm7                  ; src[ 8-15]
        jmp         .load_finished

.load_16:
        ; One 16-pixel source row is one batch
        movdqa      xmm0, [rsi]                 ; src (frame1)
        lea         rsi, [rsi + rbp]            ; += stride
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm7                  ; src[ 0- 7]
        punpckhbw   xmm1, xmm7                  ; src[ 8-15]

.load_finished:
        movdqa      xmm2, [rdx]                 ; predictor (frame2)
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, xmm7                  ; pred[ 0- 7]
        punpckhbw   xmm3, xmm7                  ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0, xmm2                  ; src - pred[ 0- 7]
        psubw       xmm1, xmm3                  ; src - pred[ 8-15]

        ; modifier *= modifier  (at most 255^2 = 65025, fits an unsigned word)
        pmullw      xmm0, xmm0                  ; modifier[ 0- 7]^2
        pmullw      xmm1, xmm1                  ; modifier[ 8-15]^2

        ; modifier *= 3, saturating at 0xFFFF.
        ; A plain 16-bit multiply wraps once |diff| >= 148 (148^2 * 3 = 65712),
        ; which would hand a large weight to a badly mismatched pixel. Saturated
        ; values still end up far above 16 after the shift and clamp to zero.
        ; xmm2/xmm3 (widened predictor) are dead here and serve as temporaries.
        movdqa      xmm2, xmm0
        movdqa      xmm3, xmm1
        paddusw     xmm0, xmm0                  ; 2 * diff^2
        paddusw     xmm1, xmm1
        paddusw     xmm0, xmm2                  ; 3 * diff^2
        paddusw     xmm1, xmm3

        ; modifier += 0x8000 >> (16 - strength)   (saturating, same reason)
        paddusw     xmm0, [rsp + rounding_bit]
        paddusw     xmm1, [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0, [rsp + strength]
        psrlw       xmm1, [rsp + strength]

        ; modifier = 16 - modifier
        ; unsigned saturating subtract floors the result at 0 when modifier > 16
        movdqa      xmm3, [GLOBAL(_const_16w)]
        movdqa      xmm2, [GLOBAL(_const_16w)]
        psubusw     xmm3, xmm1
        psubusw     xmm2, xmm0

        ; modifier *= filter_weight
        pmullw      xmm2, [rsp + filter_weight]
        pmullw      xmm3, [rsp + filter_weight]

        ; count[0..15] += modifier
        movdqa      xmm4, [rax]
        movdqa      xmm5, [rax+16]
        paddw       xmm4, xmm2
        paddw       xmm5, xmm3
        movdqa      [rax], xmm4
        movdqa      [rax+16], xmm5
        lea         rax, [rax + 16*2]           ; count += 16*(sizeof(short))

        ; Reload the predictor and widen it to words again
        pxor        xmm7, xmm7
        movdqa      xmm0, [rdx]
        lea         rdx, [rdx + 16*1]           ; pred += 16*(sizeof(char))
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm7                  ; pred[ 0- 7]
        punpckhbw   xmm1, xmm7                  ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0, xmm2
        pmullw      xmm1, xmm3

        ; Zero-extend the sixteen products to double words
        movdqa      xmm2, xmm0
        punpcklwd   xmm0, xmm7                  ; [ 0- 3]
        punpckhwd   xmm2, xmm7                  ; [ 4- 7]
        movdqa      xmm3, xmm1
        punpcklwd   xmm1, xmm7                  ; [ 8-11]
        punpckhwd   xmm3, xmm7                  ; [12-15]

        ; accumulator[0..15] += modifier * pixel_value
        ; (xmm7 is overwritten here and re-zeroed before the next pass)
        movdqa      xmm4, [rdi]
        movdqa      xmm5, [rdi+16]
        movdqa      xmm6, [rdi+32]
        movdqa      xmm7, [rdi+48]
        paddd       xmm4, xmm0
        paddd       xmm5, xmm2
        paddd       xmm6, xmm1
        paddd       xmm7, xmm3
        movdqa      [rdi], xmm4
        movdqa      [rdi+16], xmm5
        movdqa      [rdi+32], xmm6
        movdqa      [rdi+48], xmm7
        lea         rdi, [rdi + 16*4]           ; accumulator += 16*(sizeof(int))

        ; Finished once the predictor cursor reaches the precomputed end
        cmp         rdx, rcx
        je          .epilog

        pxor        xmm7, xmm7                  ; zero for extraction
        cmp         dword ptr [rsp + block_size], 16
        je          .load_16
        jmp         .load_8

.epilog:
        ; begin epilog -- unwind in exact reverse order of the prolog
        mov         rbp, [rsp + rbp_backup]
        add         rsp, stack_size
        pop         rsp
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
  SECTION_RODATA

; Read-only 16-byte aligned word vectors (eight identical words each), so
; they can be used directly as movdqa / packed-op memory operands.

align 16
_const_3w:                      ; 3 in every word
        times 8 dw 3

align 16
_const_top_bit:                 ; 0x8000 in every word; shifted right by
        times 8 dw 1<<15        ; (16 - strength) to form the rounding term

align 16
_const_16w:                     ; 16 in every word; the filter subtracts the
        times 8 dw 16           ; modifier from this with unsigned saturation