/media/libvpx/vp8/encoder/x86/sad_sse2.asm

http://github.com/zpao/v8monkey · Assembly · 410 lines · 255 code · 102 blank · 53 comment · 0 complexity · 1ec3e7a8f4593fc866a2f5bf69260e91 MD5 · raw file

  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_sad16x16_wmt(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *ref_ptr,
  ;     int            ref_stride)
  ;
  ; Sum of absolute byte differences between a 16-byte-wide, 16-row
  ; region at src_ptr and the matching region at ref_ptr.
  ;
  ; Inside the loop:
  ;   rsi / rdi = current src / ref row
  ;   rax / rdx = src / ref stride (sign-extended from 32 bits)
  ;   rcx       = src_ptr + 16*src_stride, the end marker for rsi
  ;   xmm6      = running totals, one per 64-bit half
  ; Out:
  ;   rax       = total
  ; Writes rax, rcx, rdx, flags and xmm0-xmm6. rsi and rdi are pushed and
  ; popped here; xmm6 is handed to SAVE_XMM / RESTORE_XMM.
  ;-----------------------------------------------------------------------
  global sym(vp8_sad16x16_wmt)
  sym(vp8_sad16x16_wmt):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      SAVE_XMM 6
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                   ; src_ptr
      mov         rdi, arg(2)                   ; ref_ptr
      movsxd      rax, dword ptr arg(1)         ; src_stride
      movsxd      rdx, dword ptr arg(3)         ; ref_stride

      pxor        xmm6, xmm6                    ; totals = 0
      lea         rcx, [rsi+rax*8]
      lea         rcx, [rcx+rax*8]              ; rcx = src_ptr + 16*src_stride

  .next_row_pair:
      ; First row of the pair: the two 8-byte halves are interleaved into
      ; one 16-byte register, identically for src and ref, so psadbw still
      ; pairs each src byte with its ref byte.
      movq        xmm0, QWORD PTR [rsi]
      movq        xmm2, QWORD PTR [rsi+8]
      punpcklbw   xmm0, xmm2
      movq        xmm1, QWORD PTR [rdi]
      movq        xmm3, QWORD PTR [rdi+8]
      punpcklbw   xmm1, xmm3
      psadbw      xmm0, xmm1

      ; Second row of the pair, same scheme.
      movq        xmm4, QWORD PTR [rsi+rax]
      movq        xmm2, QWORD PTR [rsi+rax+8]
      punpcklbw   xmm4, xmm2
      movq        xmm5, QWORD PTR [rdi+rdx]
      movq        xmm3, QWORD PTR [rdi+rdx+8]
      punpcklbw   xmm5, xmm3
      psadbw      xmm4, xmm5

      paddw       xmm6, xmm0
      paddw       xmm6, xmm4

      lea         rsi, [rsi+rax*2]              ; advance both pointers two rows
      lea         rdi, [rdi+rdx*2]
      cmp         rsi, rcx
      jne         .next_row_pair

      ; Fold the high-half total onto the low-half total.
      movq        xmm0, xmm6
      psrldq      xmm6, 8
      paddw       xmm0, xmm6
      movq        rax, xmm0

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_sad8x16_wmt(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *ref_ptr,
  ;     int            ref_stride,
  ;     int            max_err)
  ;
  ; Sum of absolute byte differences between an 8-byte-wide, 16-row
  ; region at src_ptr and the matching region at ref_ptr, two rows per
  ; loop pass. At the top of every pass the running total is compared
  ; (signed) with max_err; once it is greater the loop is abandoned and
  ; the partial total is returned.
  ;
  ; Inside the loop:
  ;   rsi / rdi = current src / ref row
  ;   rbx / rdx = src / ref stride (sign-extended from 32 bits)
  ;   rcx       = src_ptr + 16*src_stride, the end marker for rsi
  ;   mm7       = running total
  ; Out:
  ;   rax       = total (possibly partial, see above)
  ; Uses MMX registers mm0-mm3 and mm7 and does not execute emms.
  ;-----------------------------------------------------------------------
  global sym(vp8_sad8x16_wmt)
  sym(vp8_sad8x16_wmt):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      push        rbx
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                   ; src_ptr
      mov         rdi, arg(2)                   ; ref_ptr
      movsxd      rbx, dword ptr arg(1)         ; src_stride
      movsxd      rdx, dword ptr arg(3)         ; ref_stride

      pxor        mm7, mm7                      ; total = 0
      lea         rcx, [rsi+rbx*8]
      lea         rcx, [rcx+rbx*8]              ; rcx = src_ptr + 16*src_stride

  .next_row_pair:
      movq        rax, mm7                      ; rax = total so far
      cmp         eax, arg(4)                   ; already above max_err?
      jg          .finished                     ; yes: return the partial total

      movq        mm0, QWORD PTR [rsi]
      movq        mm1, QWORD PTR [rdi]
      psadbw      mm0, mm1                      ; first row of the pair

      movq        mm2, QWORD PTR [rsi+rbx]
      movq        mm3, QWORD PTR [rdi+rdx]
      psadbw      mm2, mm3                      ; second row of the pair

      paddw       mm7, mm0
      paddw       mm7, mm2

      lea         rsi, [rsi+rbx*2]
      lea         rdi, [rdi+rdx*2]
      cmp         rsi, rcx
      jne         .next_row_pair

      movq        rax, mm7                      ; full-block total

  .finished:
      ; begin epilog
      pop         rdi
      pop         rsi
      pop         rbx
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_sad8x8_wmt(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *ref_ptr,
  ;     int            ref_stride,
  ;     int            max_err)
  ;
  ; Sum of absolute byte differences between an 8-byte-wide, 8-row
  ; region at src_ptr and the matching region at ref_ptr, one row per
  ; loop pass. The fifth argument is read as arg(4): at the top of every
  ; pass the running total is compared (signed) with it, and once the
  ; total is greater the loop is abandoned and the partial total returned.
  ;
  ; Inside the loop:
  ;   rsi / rdi = current src / ref row
  ;   rbx / rdx = src / ref stride (sign-extended from 32 bits)
  ;   rcx       = src_ptr + 8*src_stride, the end marker for rsi
  ;   mm7       = running total
  ; Out:
  ;   rax       = total (possibly partial, see above)
  ; Uses MMX registers mm0, mm1 and mm7 and does not execute emms.
  ;-----------------------------------------------------------------------
  global sym(vp8_sad8x8_wmt)
  sym(vp8_sad8x8_wmt):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      push        rbx
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                   ; src_ptr
      mov         rdi, arg(2)                   ; ref_ptr
      movsxd      rbx, dword ptr arg(1)         ; src_stride
      movsxd      rdx, dword ptr arg(3)         ; ref_stride

      pxor        mm7, mm7                      ; total = 0
      lea         rcx, [rsi+rbx*8]              ; rcx = src_ptr + 8*src_stride

  .next_row:
      movq        rax, mm7                      ; rax = total so far
      cmp         eax, arg(4)                   ; already above max_err?
      jg          .finished                     ; yes: return the partial total

      movq        mm0, QWORD PTR [rsi]
      movq        mm1, QWORD PTR [rdi]
      psadbw      mm0, mm1
      paddw       mm7, mm0

      add         rsi, rbx                      ; next src row
      add         rdi, rdx                      ; next ref row
      cmp         rsi, rcx
      jne         .next_row

      movq        rax, mm7                      ; full-block total

  .finished:
      ; begin epilog
      pop         rdi
      pop         rsi
      pop         rbx
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_sad4x4_wmt(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *ref_ptr,
  ;     int            ref_stride)
  ;
  ; Sum of absolute byte differences between a 4-byte-wide, 4-row region
  ; at src_ptr and the matching region at ref_ptr. Straight-line code:
  ; two 4-byte rows are interleaved into one 8-byte MMX register (the
  ; same way for src and ref) so a single psadbw covers a pair of rows.
  ;
  ;   rsi / rdi = src / ref row pointer
  ;   rax / rdx = src / ref stride (sign-extended from 32 bits)
  ; Out:
  ;   rax       = total
  ; Uses MMX registers mm0-mm7 and does not execute emms.
  ;-----------------------------------------------------------------------
  global sym(vp8_sad4x4_wmt)
  sym(vp8_sad4x4_wmt):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                   ; src_ptr
      mov         rdi, arg(2)                   ; ref_ptr
      movsxd      rax, dword ptr arg(1)         ; src_stride
      movsxd      rdx, dword ptr arg(3)         ; ref_stride

      ; Rows 0 and 1.
      movd        mm0, DWORD PTR [rsi]
      movd        mm2, DWORD PTR [rsi+rax]
      punpcklbw   mm0, mm2
      movd        mm1, DWORD PTR [rdi]
      movd        mm3, DWORD PTR [rdi+rdx]
      punpcklbw   mm1, mm3
      psadbw      mm0, mm1

      lea         rsi, [rsi+rax*2]              ; step down to row 2
      lea         rdi, [rdi+rdx*2]

      ; Rows 2 and 3.
      movd        mm4, DWORD PTR [rsi]
      movd        mm6, DWORD PTR [rsi+rax]
      punpcklbw   mm4, mm6
      movd        mm5, DWORD PTR [rdi]
      movd        mm7, DWORD PTR [rdi+rdx]
      punpcklbw   mm5, mm7
      psadbw      mm4, mm5

      paddw       mm0, mm4                      ; combine both row pairs
      movq        rax, mm0

      ; begin epilog
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_sad16x8_wmt(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *ref_ptr,
  ;     int            ref_stride,
  ;     int            max_err)
  ;
  ; Sum of absolute byte differences between a 16-byte-wide, 8-row
  ; region at src_ptr and the matching region at ref_ptr, two rows per
  ; loop pass, each row handled as two 8-byte halves. The fifth argument
  ; is read as arg(4): at the top of every pass the running total is
  ; compared (signed) with it, and once the total is greater the loop is
  ; abandoned and the partial total returned.
  ;
  ; Inside the loop:
  ;   rsi / rdi = current src / ref row
  ;   rbx / rdx = src / ref stride (sign-extended from 32 bits)
  ;   rcx       = src_ptr + 8*src_stride, the end marker for rsi
  ;   mm7       = running total
  ; Out:
  ;   rax       = total (possibly partial, see above)
  ; Uses MMX registers mm0-mm5 and mm7 and does not execute emms.
  ;-----------------------------------------------------------------------
  global sym(vp8_sad16x8_wmt)
  sym(vp8_sad16x8_wmt):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      push        rbx
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                   ; src_ptr
      mov         rdi, arg(2)                   ; ref_ptr
      movsxd      rbx, dword ptr arg(1)         ; src_stride
      movsxd      rdx, dword ptr arg(3)         ; ref_stride

      pxor        mm7, mm7                      ; total = 0
      lea         rcx, [rsi+rbx*8]              ; rcx = src_ptr + 8*src_stride

  .next_row_pair:
      movq        rax, mm7                      ; rax = total so far
      cmp         eax, arg(4)                   ; already above max_err?
      jg          .finished                     ; yes: return the partial total

      ; First row of the pair: left half, right half, then combine.
      movq        mm0, QWORD PTR [rsi]
      movq        mm1, QWORD PTR [rdi]
      psadbw      mm0, mm1
      movq        mm2, QWORD PTR [rsi+8]
      movq        mm3, QWORD PTR [rdi+8]
      psadbw      mm2, mm3
      paddw       mm0, mm2

      ; Second row of the pair, same scheme (mm1/mm3 are reused).
      movq        mm4, QWORD PTR [rsi+rbx]
      movq        mm5, QWORD PTR [rdi+rdx]
      psadbw      mm4, mm5
      movq        mm1, QWORD PTR [rsi+rbx+8]
      movq        mm3, QWORD PTR [rdi+rdx+8]
      psadbw      mm1, mm3
      paddw       mm4, mm1

      paddw       mm7, mm0
      paddw       mm7, mm4

      lea         rsi, [rsi+rbx*2]
      lea         rdi, [rdi+rdx*2]
      cmp         rsi, rcx
      jne         .next_row_pair

      movq        rax, mm7                      ; full-block total

  .finished:
      ; begin epilog
      pop         rdi
      pop         rsi
      pop         rbx
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_copy32xn_sse2(
  ;     unsigned char *src_ptr,
  ;     int            src_stride,
  ;     unsigned char *dst_ptr,
  ;     int            dst_stride,
  ;     int            height);
  ;
  ; Copies `height` rows of 32 bytes from src_ptr to dst_ptr. Rows are
  ; moved four at a time while at least four remain, then one at a time.
  ; A height of zero or less copies nothing.
  ;
  ; Source rows are read with movdqu (any alignment). Destination rows are
  ; written with movdqa, so every destination row address must be 16-byte
  ; aligned.
  ;
  ;   rsi / rdi = current src / dst row
  ;   rax / rdx = src / dst stride (sign-extended from 32 bits)
  ;   rcx       = rows still to copy
  ; Writes rax, rcx, rdx, flags and xmm0-xmm7. rsi and rdi are pushed and
  ; popped here; SAVE_XMM 7 / RESTORE_XMM bracket the xmm usage.
  ;-----------------------------------------------------------------------
  global sym(vp8_copy32xn_sse2)
  sym(vp8_copy32xn_sse2):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 5
      SAVE_XMM 7
      push        rsi
      push        rdi
      ; end prolog

      mov         rsi, arg(0)                   ; src_ptr
      mov         rdi, arg(2)                   ; dst_ptr
      movsxd      rax, dword ptr arg(1)         ; src_stride
      movsxd      rdx, dword ptr arg(3)         ; dst_stride
      movsxd      rcx, dword ptr arg(4)         ; height

      ; Fewer than four rows requested: do not enter the unrolled loop,
      ; which always moves four rows before it looks at the counter.
      cmp         rcx, 4
      jl          .tail_check

  .copy_four_rows:
      movdqu      xmm0, XMMWORD PTR [rsi]
      movdqu      xmm1, XMMWORD PTR [rsi + 16]
      movdqu      xmm2, XMMWORD PTR [rsi + rax]
      movdqu      xmm3, XMMWORD PTR [rsi + rax + 16]
      lea         rsi, [rsi+rax*2]

      movdqu      xmm4, XMMWORD PTR [rsi]
      movdqu      xmm5, XMMWORD PTR [rsi + 16]
      movdqu      xmm6, XMMWORD PTR [rsi + rax]
      movdqu      xmm7, XMMWORD PTR [rsi + rax + 16]
      lea         rsi, [rsi+rax*2]

      movdqa      XMMWORD PTR [rdi], xmm0
      movdqa      XMMWORD PTR [rdi + 16], xmm1
      movdqa      XMMWORD PTR [rdi + rdx], xmm2
      movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3
      lea         rdi, [rdi+rdx*2]

      movdqa      XMMWORD PTR [rdi], xmm4
      movdqa      XMMWORD PTR [rdi + 16], xmm5
      movdqa      XMMWORD PTR [rdi + rdx], xmm6
      movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7
      lea         rdi, [rdi+rdx*2]

      sub         rcx, 4
      cmp         rcx, 4
      jge         .copy_four_rows

  .tail_check:
      ; Signed test: a zero or negative remainder must not reach the
      ; single-row loop, whose `sub rcx, 1 / jne` only stops at exactly 0.
      test        rcx, rcx
      jle         .finished

  .copy_one_row:
      movdqu      xmm0, XMMWORD PTR [rsi]
      movdqu      xmm1, XMMWORD PTR [rsi + 16]
      lea         rsi, [rsi+rax]

      movdqa      XMMWORD PTR [rdi], xmm0
      movdqa      XMMWORD PTR [rdi + 16], xmm1
      lea         rdi, [rdi+rdx]

      sub         rcx, 1
      jne         .copy_one_row

  .finished:
      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_XMM
      UNSHADOW_ARGS
      pop         rbp
      ret