/media/libvpx/vp8/encoder/x86/variance_impl_ssse3.asm

http://github.com/zpao/v8monkey · Assembly · 364 lines · 264 code · 74 blank · 26 comment · 0 complexity · ad8249bbcc257d22147c3709978449a9 MD5 · raw file

  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  11. %define xmm_filter_shift 7
  12. ;void vp8_filter_block2d_bil_var_ssse3
  13. ;(
  14. ; unsigned char *ref_ptr,
  15. ; int ref_pixels_per_line,
  16. ; unsigned char *src_ptr,
  17. ; int src_pixels_per_line,
  18. ; unsigned int Height,
  19. ; int xoffset,
  20. ; int yoffset,
  21. ; int *sum,
  22. ; unsigned int *sumsquared
  23. ;
  24. ;)
  25. ;Note: The filter coefficient at offset=0 is 128. Since pmaddubsw treats its
  26. ;second operand as signed bytes, the zero-offset case must be calculated separately.
  global sym(vp8_filter_block2d_bil_var_ssse3)
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;-----------------------------------------------------------------------
    ; Processes 16 pixels per row for Height rows.  For each row the
    ; reference pixels are (optionally) bilinearly filtered, the source
    ; pixels are subtracted, and the differences / squared differences are
    ; accumulated.  Results are stored through arg(7) (sum) and arg(8)
    ; (sum of squares).
    ;
    ; Register roles shared by every path:
    ;   rsi  = current reference row        rdi = current source row
    ;   rcx  = rows remaining (all row loops are bottom-tested, so Height
    ;          must be non-zero)
    ;   xmm6 = 8 x 16-bit running sums of (ref - src)
    ;   xmm7 = 4 x 32-bit running sums of (ref - src)^2
    ;
    ; Four paths, selected by which offsets are zero.  A zero offset cannot
    ; go through pmaddubsw because its tap of 128 does not fit a signed byte:
    ;   xoffset != 0, yoffset != 0 : horizontal + vertical filter
    ;   xoffset == 0, yoffset != 0 : .sp_only    (vertical filter only)
    ;   xoffset != 0, yoffset == 0 : .fp_only    (horizontal filter only)
    ;   xoffset == 0, yoffset == 0 : .full_pixel (no filtering)
    ;-----------------------------------------------------------------------

    pxor        xmm6, xmm6                      ; sum accumulator = 0
    pxor        xmm7, xmm7                      ; sse accumulator = 0

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(5)           ; xoffset

    test        rax, rax                        ; skip first_pass filter if xoffset=0
    jz          .sp_only

    shl         rax, 4                          ; each filter entry is 16 bytes
    lea         rax, [rax + rcx]                ; rax = HFilter

    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; skip second_pass filter if yoffset=0
    jz          .fp_only

    shl         rdx, 4
    lea         rdx, [rdx + rcx]                ; rdx = VFilter

    ;-----------------------------------------------------------------------
    ; Horizontal + vertical filtering.
    ;-----------------------------------------------------------------------
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height

    pxor        xmm4, xmm4                      ; constant zero for byte->word unpack

    ; Horizontally filter the first reference row; xmm0 carries the
    ; previous filtered row (as bytes) into each loop iteration.
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi+1]
    movdqa      xmm2, xmm0

    punpcklbw   xmm0, xmm1                      ; interleave p[x], p[x+1]
    punpckhbw   xmm2, xmm1
    pmaddubsw   xmm0, [rax]                     ; p[x]*tap0 + p[x+1]*tap1
    pmaddubsw   xmm2, [rax]

    paddw       xmm0, [GLOBAL(xmm_bi_rd)]       ; round
    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    psraw       xmm0, xmm_filter_shift
    psraw       xmm2, xmm_filter_shift

    packuswb    xmm0, xmm2                      ; back to 16 bytes

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ; ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)            ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
    lea         rsi, [rsi + r8]
%endif

.both_pass_loop:
    ; Horizontal pass on the next reference row.
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    movdqa      xmm3, xmm1

    punpcklbw   xmm1, xmm2
    punpckhbw   xmm3, xmm2
    pmaddubsw   xmm1, [rax]
    pmaddubsw   xmm3, [rax]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift
    packuswb    xmm1, xmm3

    ; Vertical pass between the previous row (xmm0) and this row (xmm1).
    movdqa      xmm2, xmm0
    movdqa      xmm0, xmm1                      ; this row becomes "previous"
    movdqa      xmm3, xmm2

    punpcklbw   xmm2, xmm1
    punpckhbw   xmm3, xmm1
    pmaddubsw   xmm2, [rdx]
    pmaddubsw   xmm3, [rdx]

    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm2, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift

    ; Widen 16 source pixels to words and accumulate the differences.
    movq        xmm1, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm2, xmm1
    psubw       xmm3, xmm5
    paddw       xmm6, xmm2
    paddw       xmm6, xmm3
    pmaddwd     xmm2, xmm2                      ; square, add adjacent pairs
    pmaddwd     xmm3, xmm3
    paddd       xmm7, xmm2
    paddd       xmm7, xmm3

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ; ref_pixels_per_line
    add         rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
    lea         rsi, [rsi + r8]
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .both_pass_loop

    jmp         .variance

    ;-----------------------------------------------------------------------
    ; xoffset == 0: vertical filter only.  rcx still holds the filter table
    ; base on entry.
    ;-----------------------------------------------------------------------
.sp_only:
    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; both xoffset=0 and yoffset=0
    jz          .full_pixel

    shl         rdx, 4
    lea         rdx, [rdx + rcx]                ; rdx = VFilter

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line

    pxor        xmm4, xmm4                      ; constant zero for byte->word unpack

    movdqu      xmm1, XMMWORD PTR [rsi]         ; first row = initial "previous" row

%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

    lea         rsi, [rsi + rax]

.sp_only_loop:
    movdqu      xmm3, XMMWORD PTR [rsi]         ; current row
    movdqa      xmm2, xmm1
    movdqa      xmm0, xmm3                      ; keep current row for next iteration

    punpcklbw   xmm1, xmm3                      ; interleave previous/current rows
    punpckhbw   xmm2, xmm3
    pmaddubsw   xmm1, [rdx]
    pmaddubsw   xmm2, [rdx]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm2, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm1, xmm3
    psubw       xmm2, xmm5
    paddw       xmm6, xmm1
    paddw       xmm6, xmm2
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    paddd       xmm7, xmm1
    paddd       xmm7, xmm2

    movdqa      xmm1, xmm0                      ; current row becomes "previous"
    lea         rsi, [rsi + rax]                ; ref_pixels_per_line

%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .sp_only_loop

    jmp         .variance

    ;-----------------------------------------------------------------------
    ; xoffset == 0 and yoffset == 0: plain difference, no filtering.
    ;-----------------------------------------------------------------------
.full_pixel:
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ; src_pixels_per_line
    pxor        xmm0, xmm0                      ; constant zero for byte->word unpack

.full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0
    movq        xmm2, QWORD PTR [rsi+8]
    punpcklbw   xmm2, xmm0

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0
    movq        xmm4, QWORD PTR [rdi+8]
    punpcklbw   xmm4, xmm0

    psubw       xmm1, xmm3
    psubw       xmm2, xmm4
    paddw       xmm6, xmm1
    paddw       xmm6, xmm2
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    paddd       xmm7, xmm1
    paddd       xmm7, xmm2

    lea         rsi, [rsi + rax]                ; ref_pixels_per_line
    lea         rdi, [rdi + rdx]                ; src_pixels_per_line
    sub         rcx, 1
    jnz         .full_pixel_loop

    jmp         .variance

    ;-----------------------------------------------------------------------
    ; yoffset == 0: horizontal filter only.  rax holds HFilter on entry.
    ;-----------------------------------------------------------------------
.fp_only:
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rdx, dword ptr arg(1)           ; ref_pixels_per_line

    pxor        xmm4, xmm4                      ; constant zero for byte->word unpack

%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

.fp_only_loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    movdqa      xmm3, xmm1

    punpcklbw   xmm1, xmm2                      ; interleave p[x], p[x+1]
    punpckhbw   xmm3, xmm2
    pmaddubsw   xmm1, [rax]
    pmaddubsw   xmm3, [rax]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift

    movq        xmm2, QWORD PTR [rdi]           ; 8-byte load, like every other src load
    punpcklbw   xmm2, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm1, xmm2
    psubw       xmm3, xmm5
    paddw       xmm6, xmm1
    paddw       xmm6, xmm3
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm3, xmm3
    paddd       xmm7, xmm1
    paddd       xmm7, xmm3

    lea         rsi, [rsi + rdx]                ; ref_pixels_per_line

%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ; src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .fp_only_loop
    ; falls through to .variance

    ;-----------------------------------------------------------------------
    ; Horizontal reduction of the accumulators and store of the results.
    ;-----------------------------------------------------------------------
.variance:
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    ; Sign-extend the eight 16-bit sums to 32 bits: place each word in the
    ; high half of a dword, then shift right arithmetically by 16.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1                      ; 4 partial sums

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm7

    punpckldq   xmm6, xmm5                      ; spread sse dwords over qwords
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5                      ; same for the sums
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8                         ; fold upper half onto lower half
    psrldq      xmm1, 8

    paddd       xmm6, xmm7                      ; low dword = total sse
    paddd       xmm0, xmm1                      ; low dword = total sum

    mov         rsi, arg(7)                     ; [Sum]
    mov         rdi, arg(8)                     ; [SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
  SECTION_RODATA

; Rounding term added before the arithmetic right shift by xmm_filter_shift:
; half of (1 << xmm_filter_shift), replicated into all eight word lanes.
align 16
xmm_bi_rd:
    times 8 dw (1 << (xmm_filter_shift - 1))

; Bilinear tap pairs for offsets 0..7, one 16-byte entry per offset.
; Entry k holds the byte pair (128 - 16*k, 16*k) repeated eight times, so an
; entry is addressed as table + (offset << 4).
align 16
vp8_bilinear_filters_ssse3:
%assign second_tap 0
%rep 8
    times 8 db (128 - second_tap), second_tap
%assign second_tap second_tap + 16
%endrep