/media/libvpx/vp8/encoder/x86/variance_impl_mmx.asm

http://github.com/zpao/v8monkey · Assembly · 851 lines · 579 code · 179 blank · 93 comment · 0 complexity · c873f374701f250c3cf103d0bc103675 MD5 · raw file

  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "vpx_ports/x86_abi_support.asm"
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
  ;
  ; Returns the sum of squares of the 256 consecutive 16-bit values that
  ; start at src_ptr (16 loop iterations x 16 values per iteration).
  ;
  ; Register roles:
  ;   rax       = read pointer (advanced by 32 bytes per iteration)
  ;   rcx       = iterations remaining
  ;   mm0-mm3   = pairwise squared-and-added inputs (pmaddwd x, x)
  ;   mm4       = two 32-bit partial sums
  ; Stack: 8 bytes, used to spill mm4 for the final scalar add.
  ;-----------------------------------------------------------------------
  global sym(vp8_get_mb_ss_mmx)
  sym(vp8_get_mb_ss_mmx):
      push        rbp
      mov         rbp, rsp
      ; NOTE(review): only arg(0) is read below; the count of 7 looks larger
      ; than needed - confirm against the macro before reducing it.
      SHADOW_ARGS_TO_STACK 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      sub         rsp, 8
      ; end prolog

      mov         rax, arg(0)             ; src_ptr
      mov         rcx, 16                 ; 16 iterations of 16 shorts
      pxor        mm4, mm4                ; clear the partial sums

  .NEXTROW:
      movq        mm0, [rax]
      movq        mm1, [rax+8]
      movq        mm2, [rax+16]
      movq        mm3, [rax+24]

      pmaddwd     mm0, mm0                ; x*x, adjacent pairs summed to dwords
      pmaddwd     mm1, mm1
      pmaddwd     mm2, mm2
      pmaddwd     mm3, mm3

      paddd       mm4, mm0
      paddd       mm4, mm1
      paddd       mm4, mm2
      paddd       mm4, mm3

      add         rax, 32
      dec         rcx
      ; dec does not write CF, so branch on ZF only (the old "ja" also
      ; tested whatever carry the preceding add happened to leave behind).
      jnz         .NEXTROW

      movq        QWORD PTR [rsp], mm4

      ; return sum[0] + sum[1]
      movsxd      rax, dword ptr [rsp]
      movsxd      rcx, dword ptr [rsp+4]
      add         rax, rcx

      ; begin epilog
      add         rsp, 8
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_get8x8var_mmx
  ; (
  ;     unsigned char *src_ptr,
  ;     int            source_stride,
  ;     unsigned char *ref_ptr,
  ;     int            recon_stride,
  ;     unsigned int  *SSE,
  ;     int           *Sum
  ; )
  ;
  ; Reads eight rows of eight bytes from src_ptr and ref_ptr and stores
  ;     *SSE = sum of (src - ref)^2
  ;     *Sum = sum of (src - ref)
  ; Always returns 0.
  ;
  ; Register roles:
  ;   rax = current src row,  rcx = source_stride (sign-extended)
  ;   rbx = current ref row,  rdx = recon_stride  (sign-extended)
  ;   mm5 = four 16-bit running sums of differences
  ;   mm6 = zero, used to widen bytes to words
  ;   mm7 = two 32-bit running sums of squared differences
  ; Stack: 16 bytes, used to spill mm7 ([rsp]) and mm5 ([rsp+8]).
  ;-----------------------------------------------------------------------
  global sym(vp8_get8x8var_mmx)
  sym(vp8_get8x8var_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      push        rbx
      sub         rsp, 16
      ; end prolog

      pxor        mm5, mm5                ; difference accumulator
      pxor        mm6, mm6                ; constant zero
      pxor        mm7, mm7                ; squared-difference accumulator

      mov         rax, arg(0)             ; src_ptr
      mov         rbx, arg(2)             ; ref_ptr
      movsxd      rcx, dword ptr arg(1)   ; source_stride
      movsxd      rdx, dword ptr arg(3)   ; recon_stride

      ; One expansion per row; all eight rows run the same sequence.
  %rep 8
      movq        mm0, [rax]              ; eight src bytes
      movq        mm1, [rbx]              ; eight ref bytes
      movq        mm2, mm0
      movq        mm3, mm1

      punpcklbw   mm0, mm6                ; low four bytes  -> words
      punpcklbw   mm1, mm6
      punpckhbw   mm2, mm6                ; high four bytes -> words
      punpckhbw   mm3, mm6

      psubsw      mm0, mm1                ; src - ref, low half
      psubsw      mm2, mm3                ; src - ref, high half

      paddw       mm5, mm0                ; accumulate differences
      paddw       mm5, mm2

      pmaddwd     mm0, mm0                ; square, pairs summed to dwords
      pmaddwd     mm2, mm2
      paddd       mm7, mm0                ; accumulate squares
      paddd       mm7, mm2

      add         rax, rcx                ; next src row
      add         rbx, rdx                ; next ref row
  %endrep

      ; Spill both accumulators and reduce them with scalar adds.
      movq        QWORD PTR [rsp], mm7
      movq        QWORD PTR [rsp+8], mm5

      movsx       rdx, WORD PTR [rsp+8]
      movsx       rcx, WORD PTR [rsp+10]
      add         rdx, rcx
      movsx       rcx, WORD PTR [rsp+12]
      add         rdx, rcx
      movsx       rcx, WORD PTR [rsp+14]
      add         rdx, rcx                ; XSum

      movsxd      rax, DWORD PTR [rsp]
      movsxd      rcx, DWORD PTR [rsp+4]
      add         rax, rcx                ; XXSum

      mov         rsi, arg(4)             ; SSE
      mov         rdi, arg(5)             ; Sum
      mov         dword ptr [rsi], eax
      mov         dword ptr [rdi], edx

      xor         rax, rax                ; return 0

      ; begin epilog
      add         rsp, 16
      pop         rbx
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_get4x4var_mmx
  ; (
  ;     unsigned char *src_ptr,
  ;     int            source_stride,
  ;     unsigned char *ref_ptr,
  ;     int            recon_stride,
  ;     unsigned int  *SSE,
  ;     int           *Sum
  ; )
  ;
  ; Reads four rows of four bytes from src_ptr and ref_ptr and stores
  ;     *SSE = sum of (src - ref)^2
  ;     *Sum = sum of (src - ref)
  ; Always returns 0.
  ;
  ; Each row is loaded with movd, so exactly four bytes are read per row.
  ; (Only the low four bytes are consumed by punpcklbw, so an eight-byte
  ; load would touch four bytes beyond the block on every row.)
  ;
  ; Register roles:
  ;   rax = current src row,  rcx = source_stride (sign-extended)
  ;   rbx = current ref row,  rdx = recon_stride  (sign-extended)
  ;   mm5 = four 16-bit running sums of differences
  ;   mm6 = zero, used to widen bytes to words
  ;   mm7 = two 32-bit running sums of squared differences
  ; Stack: 16 bytes, used to spill mm7 ([rsp]) and mm5 ([rsp+8]).
  ;-----------------------------------------------------------------------
  global sym(vp8_get4x4var_mmx)
  sym(vp8_get4x4var_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      push        rsi
      push        rdi
      push        rbx
      sub         rsp, 16
      ; end prolog

      pxor        mm5, mm5                ; difference accumulator
      pxor        mm6, mm6                ; constant zero
      pxor        mm7, mm7                ; squared-difference accumulator

      mov         rax, arg(0)             ; src_ptr
      mov         rbx, arg(2)             ; ref_ptr
      movsxd      rcx, dword ptr arg(1)   ; source_stride
      movsxd      rdx, dword ptr arg(3)   ; recon_stride

      ; Rows 1-3: process the row, then step both pointers.
  %rep 3
      movd        mm0, [rax]              ; four src bytes
      movd        mm1, [rbx]              ; four ref bytes
      punpcklbw   mm0, mm6                ; bytes -> words
      punpcklbw   mm1, mm6
      psubsw      mm0, mm1                ; src - ref
      paddw       mm5, mm0                ; accumulate differences
      pmaddwd     mm0, mm0                ; square, pairs summed to dwords
      paddd       mm7, mm0                ; accumulate squares
      add         rax, rcx                ; next src row
      add         rbx, rdx                ; next ref row
  %endrep

      ; Row 4: last row, no pointer update needed.
      movd        mm0, [rax]
      movd        mm1, [rbx]
      punpcklbw   mm0, mm6
      punpcklbw   mm1, mm6
      psubsw      mm0, mm1
      paddw       mm5, mm0
      pmaddwd     mm0, mm0
      paddd       mm7, mm0

      ; Spill both accumulators and reduce them with scalar adds.
      movq        QWORD PTR [rsp+8], mm5
      movq        QWORD PTR [rsp], mm7

      movsx       rdx, WORD PTR [rsp+8]
      movsx       rcx, WORD PTR [rsp+10]
      movsx       rbx, WORD PTR [rsp+12]
      movsx       rax, WORD PTR [rsp+14]
      add         rdx, rcx
      add         rbx, rax
      add         rdx, rbx                ; XSum

      movsxd      rax, DWORD PTR [rsp]
      movsxd      rcx, DWORD PTR [rsp+4]
      add         rax, rcx                ; XXSum

      mov         rsi, arg(4)             ; SSE
      mov         rdi, arg(5)             ; Sum
      mov         dword ptr [rsi], eax
      mov         dword ptr [rdi], edx

      xor         rax, rax                ; return 0

      ; begin epilog
      add         rsp, 16
      pop         rbx
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; unsigned int vp8_get4x4sse_cs_mmx
  ; (
  ;     unsigned char *src_ptr,
  ;     int            source_stride,
  ;     unsigned char *ref_ptr,
  ;     int            recon_stride
  ; )
  ;
  ; Reads four rows of four bytes from src_ptr and ref_ptr and returns the
  ; sum of (src - ref)^2 in the low 32 bits of rax.
  ;
  ; Register roles:
  ;   rax = current src row,  rcx = source_stride (sign-extended)
  ;   rbx = current ref row,  rdx = recon_stride  (sign-extended)
  ;   mm6 = zero, used to widen bytes to words
  ;   mm7 = two 32-bit running sums of squared differences
  ;-----------------------------------------------------------------------
  global sym(vp8_get4x4sse_cs_mmx)
  sym(vp8_get4x4sse_cs_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 4
      push        rsi
      push        rdi
      push        rbx
      ; end prolog

      pxor        mm6, mm6                ; constant zero
      pxor        mm7, mm7                ; squared-difference accumulator

      mov         rax, arg(0)             ; src_ptr
      mov         rbx, arg(2)             ; ref_ptr
      movsxd      rcx, dword ptr arg(1)   ; source_stride
      movsxd      rdx, dword ptr arg(3)   ; recon_stride

      ; Rows 1-3: process the row, then step both pointers.
  %rep 3
      movd        mm0, [rax]              ; four src bytes
      movd        mm1, [rbx]              ; four ref bytes
      punpcklbw   mm0, mm6                ; bytes -> words
      punpcklbw   mm1, mm6
      psubsw      mm0, mm1                ; src - ref
      pmaddwd     mm0, mm0                ; square, pairs summed to dwords
      paddd       mm7, mm0                ; accumulate squares
      add         rax, rcx                ; next src row
      add         rbx, rdx                ; next ref row
  %endrep

      ; Row 4: last row, pointers are not advanced.
      movd        mm0, [rax]
      movd        mm1, [rbx]
      punpcklbw   mm0, mm6
      punpcklbw   mm1, mm6
      psubsw      mm0, mm1
      pmaddwd     mm0, mm0
      paddd       mm7, mm0

      ; Fold the high partial sum onto the low one and hand it back.
      movq        mm0, mm7
      psrlq       mm7, 32
      paddd       mm0, mm7
      movq        rax, mm0

      ; begin epilog
      pop         rbx
      pop         rdi
      pop         rsi
      UNSHADOW_ARGS
      pop         rbp
      ret
  ; Right shift applied after each bilinear filter pass.
  %define mmx_filter_shift 7

  ;-----------------------------------------------------------------------
  ; void vp8_filter_block2d_bil4x4_var_mmx
  ; (
  ;     unsigned char  *ref_ptr,
  ;     int             ref_pixels_per_line,
  ;     unsigned char  *src_ptr,
  ;     int             src_pixels_per_line,
  ;     unsigned short *HFilter,
  ;     unsigned short *VFilter,
  ;     int            *sum,
  ;     unsigned int   *sumsquared
  ; )
  ;
  ; Applies a two-pass (horizontal, then vertical) 2-tap filter to a
  ; four-pixel-wide area of ref_ptr and compares four filtered rows with
  ; src_ptr:
  ;     *sum        = sum of (filtered - src)
  ;     *sumsquared = sum of (filtered - src)^2
  ;
  ; Each pass computes (a * tap0 + b * tap1 + mmx_bi_rd) >> mmx_filter_shift,
  ; where tap0 is the four words at Filter and tap1 the four at Filter+8.
  ; The horizontal pass reads five rows of ref (one before the loop plus
  ; one per iteration), using pixels at [rsi] and [rsi+1].
  ;
  ; Register roles:
  ;   rax = HFilter, rdx = VFilter
  ;   rsi = current ref row, rdi = current src row, rcx = rows remaining
  ;   r8/r9 (64-bit only) = ref / src strides, loaded once before the loop
  ;   mm0 = zero, mm5 = previous horizontally-filtered row (words)
  ;   mm6 = four 16-bit running sums of differences
  ;   mm7 = two 32-bit running sums of squared differences
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block2d_bil4x4_var_mmx)
  sym(vp8_filter_block2d_bil4x4_var_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 8
      GET_GOT     rbx
      push        rsi
      push        rdi
      sub         rsp, 16
      ; end prolog

      pxor        mm6, mm6                ; difference accumulator
      pxor        mm7, mm7                ; squared-difference accumulator

      mov         rax, arg(4)             ; HFilter
      mov         rdx, arg(5)             ; VFilter

      mov         rsi, arg(0)             ; ref_ptr
      mov         rdi, arg(2)             ; src_ptr

      mov         rcx, 4                  ; four output rows
      pxor        mm0, mm0                ; constant zero

      ; Horizontal pass on the first ref row; kept in mm5 as the
      ; "previous row" for the first vertical pass.
      movd        mm1, [rsi]
      movd        mm3, [rsi+1]

      punpcklbw   mm1, mm0
      pmullw      mm1, [rax]

      punpcklbw   mm3, mm0
      pmullw      mm3, [rax+8]

      paddw       mm1, mm3
      paddw       mm1, [GLOBAL(mmx_bi_rd)]

      psraw       mm1, mmx_filter_shift
      movq        mm5, mm1

  %if ABI_IS_32BIT
      add         rsi, dword ptr arg(1)   ; ref_pixels_per_line
  %else
      ; Strides are loop-invariant: load them once and reuse in the loop.
      movsxd      r8, dword ptr arg(1)    ; ref_pixels_per_line
      movsxd      r9, dword ptr arg(3)    ; src_pixels_per_line
      add         rsi, r8
  %endif

  .next_row:
      ; Horizontal pass on the current ref row.
      movd        mm1, [rsi]
      movd        mm3, [rsi+1]

      punpcklbw   mm1, mm0
      pmullw      mm1, [rax]

      punpcklbw   mm3, mm0
      pmullw      mm3, [rax+8]

      paddw       mm1, mm3
      paddw       mm1, [GLOBAL(mmx_bi_rd)]

      psraw       mm1, mmx_filter_shift

      ; Vertical pass: previous row (mm3) against current row (mm1);
      ; the current row becomes the previous row for the next iteration.
      movq        mm3, mm5
      movq        mm5, mm1

      pmullw      mm3, [rdx]
      pmullw      mm1, [rdx+8]

      paddw       mm1, mm3
      paddw       mm1, [GLOBAL(mmx_bi_rd)]

      psraw       mm1, mmx_filter_shift

      ; Difference against the source row, then accumulate.
      movd        mm3, [rdi]
      punpcklbw   mm3, mm0

      psubw       mm1, mm3
      paddw       mm6, mm1

      pmaddwd     mm1, mm1
      paddd       mm7, mm1

  %if ABI_IS_32BIT
      add         rsi, dword ptr arg(1)   ; ref_pixels_per_line
      add         rdi, dword ptr arg(3)   ; src_pixels_per_line
  %else
      add         rsi, r8
      add         rdi, r9
  %endif
      sub         rcx, 1
      jnz         .next_row

      ; Reduce mm6: put each word in the upper half of a dword, add the
      ; dwords, then shift right arithmetically to sign-extend the total.
      pxor        mm3, mm3
      pxor        mm2, mm2

      punpcklwd   mm2, mm6
      punpckhwd   mm3, mm6

      paddd       mm2, mm3
      movq        mm6, mm2

      psrlq       mm6, 32
      paddd       mm2, mm6

      psrad       mm2, 16

      ; Reduce mm7: add the high dword onto the low dword.
      movq        mm4, mm7

      psrlq       mm4, 32
      paddd       mm4, mm7

      mov         rdi, arg(6)             ; sum
      mov         rsi, arg(7)             ; sumsquared

      movd        dword ptr [rdi], mm2
      movd        dword ptr [rsi], mm4

      ; begin epilog
      add         rsp, 16
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block2d_bil_var_mmx
  ; (
  ;     unsigned char  *ref_ptr,
  ;     int             ref_pixels_per_line,
  ;     unsigned char  *src_ptr,
  ;     int             src_pixels_per_line,
  ;     unsigned int    Height,
  ;     unsigned short *HFilter,
  ;     unsigned short *VFilter,
  ;     int            *sum,
  ;     unsigned int   *sumsquared
  ; )
  ;
  ; Applies a two-pass (horizontal, then vertical) 2-tap filter to an
  ; eight-pixel-wide area of ref_ptr and compares Height filtered rows
  ; with src_ptr:
  ;     *sum        = sum of (filtered - src)
  ;     *sumsquared = sum of (filtered - src)^2
  ;
  ; Each pass computes (a * tap0 + b * tap1 + mmx_bi_rd) >> mmx_filter_shift,
  ; where tap0 is the four words at Filter and tap1 the four at Filter+8.
  ; The horizontal pass reads Height + 1 rows of ref (one before the loop
  ; plus one per iteration), using the eight bytes at [rsi] and at [rsi+1].
  ;
  ; The loop counter is decremented before it is tested, so Height is
  ; expected to be non-zero.
  ;
  ; Register roles:
  ;   rax = HFilter, rdx = VFilter
  ;   rsi = current ref row, rdi = current src row, rcx = rows remaining
  ;   r8/r9 (64-bit only) = ref / src strides, loaded once before the loop
  ;   mm0 = zero
  ;   mm5 = previous horizontally-filtered row, packed back to bytes
  ;   mm6 = four 16-bit running sums of differences
  ;   mm7 = two 32-bit running sums of squared differences
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block2d_bil_var_mmx)
  sym(vp8_filter_block2d_bil_var_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 9
      GET_GOT     rbx
      push        rsi
      push        rdi
      sub         rsp, 16
      ; end prolog

      pxor        mm6, mm6                ; difference accumulator
      pxor        mm7, mm7                ; squared-difference accumulator
      mov         rax, arg(5)             ; HFilter

      mov         rdx, arg(6)             ; VFilter
      mov         rsi, arg(0)             ; ref_ptr

      mov         rdi, arg(2)             ; src_ptr
      movsxd      rcx, dword ptr arg(4)   ; Height

      pxor        mm0, mm0                ; constant zero

      ; Horizontal pass on the first ref row (low half in mm1, high in mm2).
      movq        mm1, [rsi]
      movq        mm3, [rsi+1]

      movq        mm2, mm1
      movq        mm4, mm3

      punpcklbw   mm1, mm0
      punpckhbw   mm2, mm0

      pmullw      mm1, [rax]
      pmullw      mm2, [rax]

      punpcklbw   mm3, mm0
      punpckhbw   mm4, mm0

      pmullw      mm3, [rax+8]
      pmullw      mm4, [rax+8]

      paddw       mm1, mm3
      paddw       mm2, mm4

      paddw       mm1, [GLOBAL(mmx_bi_rd)]
      psraw       mm1, mmx_filter_shift

      paddw       mm2, [GLOBAL(mmx_bi_rd)]
      psraw       mm2, mmx_filter_shift

      ; Keep the filtered row, packed to bytes, as the "previous row".
      movq        mm5, mm1
      packuswb    mm5, mm2

  %if ABI_IS_32BIT
      add         rsi, dword ptr arg(1)   ; ref_pixels_per_line
  %else
      ; Strides are loop-invariant: load them once and reuse in the loop.
      movsxd      r8, dword ptr arg(1)    ; ref_pixels_per_line
      movsxd      r9, dword ptr arg(3)    ; src_pixels_per_line
      add         rsi, r8
  %endif

  .next_row:
      ; Horizontal pass on the current ref row.
      movq        mm1, [rsi]
      movq        mm3, [rsi+1]

      movq        mm2, mm1
      movq        mm4, mm3

      punpcklbw   mm1, mm0
      punpckhbw   mm2, mm0

      pmullw      mm1, [rax]
      pmullw      mm2, [rax]

      punpcklbw   mm3, mm0
      punpckhbw   mm4, mm0

      pmullw      mm3, [rax+8]
      pmullw      mm4, [rax+8]

      paddw       mm1, mm3
      paddw       mm2, mm4

      paddw       mm1, [GLOBAL(mmx_bi_rd)]
      psraw       mm1, mmx_filter_shift

      paddw       mm2, [GLOBAL(mmx_bi_rd)]
      psraw       mm2, mmx_filter_shift

      ; Unpack the previous row into mm3/mm4, then save the current row
      ; (packed) as the previous row for the next iteration.
      movq        mm3, mm5
      movq        mm4, mm5

      punpcklbw   mm3, mm0
      punpckhbw   mm4, mm0

      movq        mm5, mm1
      packuswb    mm5, mm2

      ; Vertical pass: previous row (mm3/mm4) against current (mm1/mm2).
      pmullw      mm3, [rdx]
      pmullw      mm4, [rdx]

      pmullw      mm1, [rdx+8]
      pmullw      mm2, [rdx+8]

      paddw       mm1, mm3
      paddw       mm2, mm4

      paddw       mm1, [GLOBAL(mmx_bi_rd)]
      paddw       mm2, [GLOBAL(mmx_bi_rd)]

      psraw       mm1, mmx_filter_shift
      psraw       mm2, mmx_filter_shift

      ; Difference against the source row, then accumulate.
      movq        mm3, [rdi]
      movq        mm4, mm3

      punpcklbw   mm3, mm0
      punpckhbw   mm4, mm0

      psubw       mm1, mm3
      psubw       mm2, mm4

      paddw       mm6, mm1
      pmaddwd     mm1, mm1

      paddw       mm6, mm2
      pmaddwd     mm2, mm2

      paddd       mm7, mm1
      paddd       mm7, mm2

  %if ABI_IS_32BIT
      add         rsi, dword ptr arg(1)   ; ref_pixels_per_line
      add         rdi, dword ptr arg(3)   ; src_pixels_per_line
  %else
      add         rsi, r8
      add         rdi, r9
  %endif
      sub         rcx, 1
      jnz         .next_row

      ; Reduce mm6: put each word in the upper half of a dword, add the
      ; dwords, then shift right arithmetically to sign-extend the total.
      pxor        mm3, mm3
      pxor        mm2, mm2

      punpcklwd   mm2, mm6
      punpckhwd   mm3, mm6

      paddd       mm2, mm3
      movq        mm6, mm2

      psrlq       mm6, 32
      paddd       mm2, mm6

      psrad       mm2, 16

      ; Reduce mm7: add the high dword onto the low dword.
      movq        mm4, mm7

      psrlq       mm4, 32
      paddd       mm4, mm7

      mov         rdi, arg(7)             ; sum
      mov         rsi, arg(8)             ; sumsquared

      movd        dword ptr [rdi], mm2
      movd        dword ptr [rsi], mm4

      ; begin epilog
      add         rsp, 16
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  SECTION_RODATA
  ; short mmx_bi_rd[4] = { 64, 64, 64, 64 };
  ; Four 16-bit words of 64, added to each filter sum before the sum is
  ; shifted right with psraw.
  align 16
  mmx_bi_rd:
      dw 64, 64, 64, 64