
/libavcodec/x86/h264_intrapred.asm

http://github.com/FFmpeg/FFmpeg
Assembly | 2757 lines | 2482 code | 134 blank | 141 comment
Possible License(s): GPL-2.0, GPL-3.0, LGPL-2.1, LGPL-3.0, CC-BY-SA-3.0
  1. ;******************************************************************************
  2. ;* H.264 intra prediction asm optimizations
  3. ;* Copyright (c) 2010 Fiona Glaser
  4. ;* Copyright (c) 2010 Holger Lubitz
  5. ;* Copyright (c) 2010 Loren Merritt
  6. ;* Copyright (c) 2010 Ronald S. Bultje
  7. ;*
  8. ;* This file is part of FFmpeg.
  9. ;*
  10. ;* FFmpeg is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* FFmpeg is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with FFmpeg; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  25. SECTION_RODATA
  26. tm_shuf: times 8 db 0x03, 0x80
  27. pw_ff00: times 8 dw 0xff00
  28. plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
  29. db 1, 2, 3, 4, 5, 6, 7, 8
  30. plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
  31. db 1, 2, 3, 4, 0, 0, 0, 0
  32. pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
  33. pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
  34. pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
  35. pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
  36. SECTION .text
  37. cextern pb_1
  38. cextern pb_3
  39. cextern pw_4
  40. cextern pw_5
  41. cextern pw_8
  42. cextern pw_16
  43. cextern pw_17
  44. cextern pw_32
  45. ;-----------------------------------------------------------------------------
  46. ; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
  47. ;-----------------------------------------------------------------------------
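; Fills all 16 rows with the 16 pixels directly above the block, i.e.
; (illustrative pseudocode) pred[y][x] = top[x] for x,y in 0..15.
; The row above src is loaded once and stored to two rows per iteration.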
  48. INIT_MMX mmx
  49. cglobal pred16x16_vertical_8, 2,3
  50. sub r0, r1
  51. mov r2, 8
  52. movq mm0, [r0+0]
  53. movq mm1, [r0+8]
  54. .loop:
  55. movq [r0+r1*1+0], mm0
  56. movq [r0+r1*1+8], mm1
  57. movq [r0+r1*2+0], mm0
  58. movq [r0+r1*2+8], mm1
  59. lea r0, [r0+r1*2]
  60. dec r2
  61. jg .loop
  62. REP_RET
  63. INIT_XMM sse
  64. cglobal pred16x16_vertical_8, 2,3
  65. sub r0, r1
  66. mov r2, 4
  67. movaps xmm0, [r0]
  68. .loop:
  69. movaps [r0+r1*1], xmm0
  70. movaps [r0+r1*2], xmm0
  71. lea r0, [r0+r1*2]
  72. movaps [r0+r1*1], xmm0
  73. movaps [r0+r1*2], xmm0
  74. lea r0, [r0+r1*2]
  75. dec r2
  76. jg .loop
  77. REP_RET
  78. ;-----------------------------------------------------------------------------
  79. ; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
  80. ;-----------------------------------------------------------------------------
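; Fills each row with a copy of the pixel to its left:
; (illustrative pseudocode) pred[y][x] = left[y].
; SSSE3 broadcasts the left byte with pshufb/pb_3; the MMX paths use
; punpcklbw + SPLATW and must store the high 8 bytes separately.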
  81. %macro PRED16x16_H 0
  82. cglobal pred16x16_horizontal_8, 2,3
  83. mov r2, 8
  84. %if cpuflag(ssse3)
  85. mova m2, [pb_3]
  86. %endif
  87. .loop:
  88. movd m0, [r0+r1*0-4]
  89. movd m1, [r0+r1*1-4]
  90. %if cpuflag(ssse3)
  91. pshufb m0, m2
  92. pshufb m1, m2
  93. %else
  94. punpcklbw m0, m0
  95. punpcklbw m1, m1
  96. SPLATW m0, m0, 3
  97. SPLATW m1, m1, 3
  98. mova [r0+r1*0+8], m0
  99. mova [r0+r1*1+8], m1
  100. %endif
  101. mova [r0+r1*0], m0
  102. mova [r0+r1*1], m1
  103. lea r0, [r0+r1*2]
  104. dec r2
  105. jg .loop
  106. REP_RET
  107. %endmacro
  108. INIT_MMX mmx
  109. PRED16x16_H
  110. INIT_MMX mmxext
  111. PRED16x16_H
  112. INIT_XMM ssse3
  113. PRED16x16_H
  114. ;-----------------------------------------------------------------------------
  115. ; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
  116. ;-----------------------------------------------------------------------------
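; DC prediction: the whole block is filled with a single value,
;   dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5.
; psadbw against zero sums the top row; the left column is accumulated
; with scalar movzx/add in the %rep loop.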
  117. %macro PRED16x16_DC 0
  118. cglobal pred16x16_dc_8, 2,7
  119. mov r4, r0
  120. sub r0, r1
  121. pxor mm0, mm0
  122. pxor mm1, mm1
  123. psadbw mm0, [r0+0]
  124. psadbw mm1, [r0+8]
  125. dec r0
  126. movzx r5d, byte [r0+r1*1]
  127. paddw mm0, mm1
  128. movd r6d, mm0
  129. lea r0, [r0+r1*2]
  130. %rep 7
  131. movzx r2d, byte [r0+r1*0]
  132. movzx r3d, byte [r0+r1*1]
  133. add r5d, r2d
  134. add r6d, r3d
  135. lea r0, [r0+r1*2]
  136. %endrep
  137. movzx r2d, byte [r0+r1*0]
  138. add r5d, r6d
  139. lea r2d, [r2+r5+16]
  140. shr r2d, 5
  141. %if cpuflag(ssse3)
  142. pxor m1, m1
  143. %endif
  144. SPLATB_REG m0, r2, m1
  145. %if mmsize==8
  146. mov r3d, 8
  147. .loop:
  148. mova [r4+r1*0+0], m0
  149. mova [r4+r1*0+8], m0
  150. mova [r4+r1*1+0], m0
  151. mova [r4+r1*1+8], m0
  152. %else
  153. mov r3d, 4
  154. .loop:
  155. mova [r4+r1*0], m0
  156. mova [r4+r1*1], m0
  157. lea r4, [r4+r1*2]
  158. mova [r4+r1*0], m0
  159. mova [r4+r1*1], m0
  160. %endif
  161. lea r4, [r4+r1*2]
  162. dec r3d
  163. jg .loop
  164. REP_RET
  165. %endmacro
  166. INIT_MMX mmxext
  167. PRED16x16_DC
  168. INIT_XMM sse2
  169. PRED16x16_DC
  170. INIT_XMM ssse3
  171. PRED16x16_DC
  172. ;-----------------------------------------------------------------------------
  173. ; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
  174. ;-----------------------------------------------------------------------------
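; VP8 TrueMotion prediction (illustrative pseudocode):
;   pred[y][x] = clip(top[x] + left[y] - topleft, 0, 255)
; The top row is widened to words once; per row, left[y]-topleft is
; broadcast and added, with packuswb providing the clip.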
  175. %macro PRED16x16_TM 0
  176. cglobal pred16x16_tm_vp8_8, 2,5
  177. sub r0, r1
  178. pxor mm7, mm7
  179. movq mm0, [r0+0]
  180. movq mm2, [r0+8]
  181. movq mm1, mm0
  182. movq mm3, mm2
  183. punpcklbw mm0, mm7
  184. punpckhbw mm1, mm7
  185. punpcklbw mm2, mm7
  186. punpckhbw mm3, mm7
  187. movzx r3d, byte [r0-1]
  188. mov r4d, 16
  189. .loop:
  190. movzx r2d, byte [r0+r1-1]
  191. sub r2d, r3d
  192. movd mm4, r2d
  193. SPLATW mm4, mm4, 0
  194. movq mm5, mm4
  195. movq mm6, mm4
  196. movq mm7, mm4
  197. paddw mm4, mm0
  198. paddw mm5, mm1
  199. paddw mm6, mm2
  200. paddw mm7, mm3
  201. packuswb mm4, mm5
  202. packuswb mm6, mm7
  203. movq [r0+r1+0], mm4
  204. movq [r0+r1+8], mm6
  205. add r0, r1
  206. dec r4d
  207. jg .loop
  208. REP_RET
  209. %endmacro
  210. INIT_MMX mmx
  211. PRED16x16_TM
  212. INIT_MMX mmxext
  213. PRED16x16_TM
  214. INIT_XMM sse2
  215. cglobal pred16x16_tm_vp8_8, 2,6,6
  216. sub r0, r1
  217. pxor xmm2, xmm2
  218. movdqa xmm0, [r0]
  219. movdqa xmm1, xmm0
  220. punpcklbw xmm0, xmm2
  221. punpckhbw xmm1, xmm2
  222. movzx r4d, byte [r0-1]
  223. mov r5d, 8
  224. .loop:
  225. movzx r2d, byte [r0+r1*1-1]
  226. movzx r3d, byte [r0+r1*2-1]
  227. sub r2d, r4d
  228. sub r3d, r4d
  229. movd xmm2, r2d
  230. movd xmm4, r3d
  231. pshuflw xmm2, xmm2, 0
  232. pshuflw xmm4, xmm4, 0
  233. punpcklqdq xmm2, xmm2
  234. punpcklqdq xmm4, xmm4
  235. movdqa xmm3, xmm2
  236. movdqa xmm5, xmm4
  237. paddw xmm2, xmm0
  238. paddw xmm3, xmm1
  239. paddw xmm4, xmm0
  240. paddw xmm5, xmm1
  241. packuswb xmm2, xmm3
  242. packuswb xmm4, xmm5
  243. movdqa [r0+r1*1], xmm2
  244. movdqa [r0+r1*2], xmm4
  245. lea r0, [r0+r1*2]
  246. dec r5d
  247. jg .loop
  248. REP_RET
  249. %if HAVE_AVX2_EXTERNAL
  250. INIT_YMM avx2
  251. cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
  252. sub dstq, strideq
  253. pmovzxbw m0, [dstq]
  254. vpbroadcastb xm1, [r0-1]
  255. pmovzxbw m1, xm1
  256. psubw m0, m1
  257. mov iterationd, 4
  258. lea stride3q, [strideq*3]
  259. .loop:
  260. vpbroadcastb xm1, [dstq+strideq*1-1]
  261. vpbroadcastb xm2, [dstq+strideq*2-1]
  262. vpbroadcastb xm3, [dstq+stride3q-1]
  263. vpbroadcastb xm4, [dstq+strideq*4-1]
  264. pmovzxbw m1, xm1
  265. pmovzxbw m2, xm2
  266. pmovzxbw m3, xm3
  267. pmovzxbw m4, xm4
  268. paddw m1, m0
  269. paddw m2, m0
  270. paddw m3, m0
  271. paddw m4, m0
  272. vpackuswb m1, m1, m2
  273. vpackuswb m3, m3, m4
  274. vpermq m1, m1, q3120
  275. vpermq m3, m3, q3120
  276. movdqa [dstq+strideq*1], xm1
  277. vextracti128 [dstq+strideq*2], m1, 1
  278. movdqa [dstq+stride3q*1], xm3
  279. vextracti128 [dstq+strideq*4], m3, 1
  280. lea dstq, [dstq+strideq*4]
  281. dec iterationd
  282. jg .loop
  283. REP_RET
  284. %endif
  285. ;-----------------------------------------------------------------------------
  286. ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
  287. ;-----------------------------------------------------------------------------
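; Plane prediction, roughly per the H.264 spec (illustrative pseudocode):
;   H = sum_{i=1..8} i*(top[7+i]  - top[7-i])
;   V = sum_{i=1..8} i*(left[7+i] - left[7-i])
;   a = 16*(top[15] + left[15] + 1)
;   pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)
; The three variants differ only in how b,c are scaled from H,V:
; h264 uses (5*H+32)>>6, rv40 uses (5*H)>>6, and svq3 uses 5*(H/4)/16
; with H and V swapped.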
  288. %macro H264_PRED16x16_PLANE 1
  289. cglobal pred16x16_plane_%1_8, 2,9,7
  290. mov r2, r1 ; +stride
  291. neg r1 ; -stride
  292. movh m0, [r0+r1 -1]
  293. %if mmsize == 8
  294. pxor m4, m4
  295. movh m1, [r0+r1 +3 ]
  296. movh m2, [r0+r1 +8 ]
  297. movh m3, [r0+r1 +12]
  298. punpcklbw m0, m4
  299. punpcklbw m1, m4
  300. punpcklbw m2, m4
  301. punpcklbw m3, m4
  302. pmullw m0, [pw_m8tom1 ]
  303. pmullw m1, [pw_m8tom1+8]
  304. pmullw m2, [pw_1to8 ]
  305. pmullw m3, [pw_1to8 +8]
  306. paddw m0, m2
  307. paddw m1, m3
  308. %else ; mmsize == 16
  309. %if cpuflag(ssse3)
  310. movhps m0, [r0+r1 +8]
  311. pmaddubsw m0, [plane_shuf] ; H coefficients
  312. %else ; sse2
  313. pxor m2, m2
  314. movh m1, [r0+r1 +8]
  315. punpcklbw m0, m2
  316. punpcklbw m1, m2
  317. pmullw m0, [pw_m8tom1]
  318. pmullw m1, [pw_1to8]
  319. paddw m0, m1
  320. %endif
  321. movhlps m1, m0
  322. %endif
  323. paddw m0, m1
  324. %if cpuflag(mmxext)
  325. PSHUFLW m1, m0, 0xE
  326. %elif cpuflag(mmx)
  327. mova m1, m0
  328. psrlq m1, 32
  329. %endif
  330. paddw m0, m1
  331. %if cpuflag(mmxext)
  332. PSHUFLW m1, m0, 0x1
  333. %elif cpuflag(mmx)
  334. mova m1, m0
  335. psrlq m1, 16
  336. %endif
  337. paddw m0, m1 ; sum of H coefficients
  338. lea r4, [r0+r2*8-1]
  339. lea r3, [r0+r2*4-1]
  340. add r4, r2
  341. %if ARCH_X86_64
  342. %define e_reg r8
  343. %else
  344. %define e_reg r0
  345. %endif
  346. movzx e_reg, byte [r3+r2*2 ]
  347. movzx r5, byte [r4+r1 ]
  348. sub r5, e_reg
  349. movzx e_reg, byte [r3+r2 ]
  350. movzx r6, byte [r4 ]
  351. sub r6, e_reg
  352. lea r5, [r5+r6*2]
  353. movzx e_reg, byte [r3+r1 ]
  354. movzx r6, byte [r4+r2*2 ]
  355. sub r6, e_reg
  356. lea r5, [r5+r6*4]
  357. movzx e_reg, byte [r3 ]
  358. %if ARCH_X86_64
  359. movzx r7, byte [r4+r2 ]
  360. sub r7, e_reg
  361. %else
  362. movzx r6, byte [r4+r2 ]
  363. sub r6, e_reg
  364. lea r5, [r5+r6*4]
  365. sub r5, r6
  366. %endif
  367. lea e_reg, [r3+r1*4]
  368. lea r3, [r4+r2*4]
  369. movzx r4, byte [e_reg+r2 ]
  370. movzx r6, byte [r3 ]
  371. sub r6, r4
  372. %if ARCH_X86_64
  373. lea r6, [r7+r6*2]
  374. lea r5, [r5+r6*2]
  375. add r5, r6
  376. %else
  377. lea r5, [r5+r6*4]
  378. lea r5, [r5+r6*2]
  379. %endif
  380. movzx r4, byte [e_reg ]
  381. %if ARCH_X86_64
  382. movzx r7, byte [r3 +r2 ]
  383. sub r7, r4
  384. sub r5, r7
  385. %else
  386. movzx r6, byte [r3 +r2 ]
  387. sub r6, r4
  388. lea r5, [r5+r6*8]
  389. sub r5, r6
  390. %endif
  391. movzx r4, byte [e_reg+r1 ]
  392. movzx r6, byte [r3 +r2*2]
  393. sub r6, r4
  394. %if ARCH_X86_64
  395. add r6, r7
  396. %endif
  397. lea r5, [r5+r6*8]
  398. movzx r4, byte [e_reg+r2*2]
  399. movzx r6, byte [r3 +r1 ]
  400. sub r6, r4
  401. lea r5, [r5+r6*4]
  402. add r5, r6 ; sum of V coefficients
  403. %if ARCH_X86_64 == 0
  404. mov r0, r0m
  405. %endif
  406. %ifidn %1, h264
  407. lea r5, [r5*5+32]
  408. sar r5, 6
  409. %elifidn %1, rv40
  410. lea r5, [r5*5]
  411. sar r5, 6
  412. %elifidn %1, svq3
  413. test r5, r5
  414. lea r6, [r5+3]
  415. cmovs r5, r6
  416. sar r5, 2 ; V/4
  417. lea r5, [r5*5] ; 5*(V/4)
  418. test r5, r5
  419. lea r6, [r5+15]
  420. cmovs r5, r6
  421. sar r5, 4 ; (5*(V/4))/16
  422. %endif
  423. movzx r4, byte [r0+r1 +15]
  424. movzx r3, byte [r3+r2*2 ]
  425. lea r3, [r3+r4+1]
  426. shl r3, 4
  427. movd r1d, m0
  428. movsx r1d, r1w
  429. %ifnidn %1, svq3
  430. %ifidn %1, h264
  431. lea r1d, [r1d*5+32]
  432. %else ; rv40
  433. lea r1d, [r1d*5]
  434. %endif
  435. sar r1d, 6
  436. %else ; svq3
  437. test r1d, r1d
  438. lea r4d, [r1d+3]
  439. cmovs r1d, r4d
  440. sar r1d, 2 ; H/4
  441. lea r1d, [r1d*5] ; 5*(H/4)
  442. test r1d, r1d
  443. lea r4d, [r1d+15]
  444. cmovs r1d, r4d
  445. sar r1d, 4 ; (5*(H/4))/16
  446. %endif
  447. movd m0, r1d
  448. add r1d, r5d
  449. add r3d, r1d
  450. shl r1d, 3
  451. sub r3d, r1d ; a
  452. movd m1, r5d
  453. movd m3, r3d
  454. SPLATW m0, m0, 0 ; H
  455. SPLATW m1, m1, 0 ; V
  456. SPLATW m3, m3, 0 ; a
  457. %ifidn %1, svq3
  458. SWAP 0, 1
  459. %endif
  460. mova m2, m0
  461. %if mmsize == 8
  462. mova m5, m0
  463. %endif
  464. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  465. %if mmsize == 16
  466. psllw m2, 3
  467. %else
  468. psllw m5, 3
  469. psllw m2, 2
  470. mova m6, m5
  471. paddw m6, m2
  472. %endif
  473. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  474. paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
  475. %if mmsize == 8
  476. paddw m5, m0 ; a + {8,9,10,11}*H
  477. paddw m6, m0 ; a + {12,13,14,15}*H
  478. %endif
  479. mov r4, 8
  480. .loop:
  481. mova m3, m0 ; b[0..7]
  482. mova m4, m2 ; b[8..15]
  483. psraw m3, 5
  484. psraw m4, 5
  485. packuswb m3, m4
  486. mova [r0], m3
  487. %if mmsize == 8
  488. mova m3, m5 ; b[8..11]
  489. mova m4, m6 ; b[12..15]
  490. psraw m3, 5
  491. psraw m4, 5
  492. packuswb m3, m4
  493. mova [r0+8], m3
  494. %endif
  495. paddw m0, m1
  496. paddw m2, m1
  497. %if mmsize == 8
  498. paddw m5, m1
  499. paddw m6, m1
  500. %endif
  501. mova m3, m0 ; b[0..7]
  502. mova m4, m2 ; b[8..15]
  503. psraw m3, 5
  504. psraw m4, 5
  505. packuswb m3, m4
  506. mova [r0+r2], m3
  507. %if mmsize == 8
  508. mova m3, m5 ; b[8..11]
  509. mova m4, m6 ; b[12..15]
  510. psraw m3, 5
  511. psraw m4, 5
  512. packuswb m3, m4
  513. mova [r0+r2+8], m3
  514. %endif
  515. paddw m0, m1
  516. paddw m2, m1
  517. %if mmsize == 8
  518. paddw m5, m1
  519. paddw m6, m1
  520. %endif
  521. lea r0, [r0+r2*2]
  522. dec r4
  523. jg .loop
  524. REP_RET
  525. %endmacro
  526. INIT_MMX mmx
  527. H264_PRED16x16_PLANE h264
  528. H264_PRED16x16_PLANE rv40
  529. H264_PRED16x16_PLANE svq3
  530. INIT_MMX mmxext
  531. H264_PRED16x16_PLANE h264
  532. H264_PRED16x16_PLANE rv40
  533. H264_PRED16x16_PLANE svq3
  534. INIT_XMM sse2
  535. H264_PRED16x16_PLANE h264
  536. H264_PRED16x16_PLANE rv40
  537. H264_PRED16x16_PLANE svq3
  538. INIT_XMM ssse3
  539. H264_PRED16x16_PLANE h264
  540. H264_PRED16x16_PLANE rv40
  541. H264_PRED16x16_PLANE svq3
  542. ;-----------------------------------------------------------------------------
  543. ; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
  544. ;-----------------------------------------------------------------------------
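; Chroma 8x8 plane prediction (illustrative pseudocode):
;   H = sum_{i=1..4} i*(top[3+i] - top[3-i]),  V likewise from the left edge
;   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;   a = 16*(top[7] + left[7] + 1)
;   pred[y][x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5)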
  545. %macro H264_PRED8x8_PLANE 0
  546. cglobal pred8x8_plane_8, 2,9,7
  547. mov r2, r1 ; +stride
  548. neg r1 ; -stride
  549. movd m0, [r0+r1 -1]
  550. %if mmsize == 8
  551. pxor m2, m2
  552. movh m1, [r0+r1 +4 ]
  553. punpcklbw m0, m2
  554. punpcklbw m1, m2
  555. pmullw m0, [pw_m4to4]
  556. pmullw m1, [pw_m4to4+8]
  557. %else ; mmsize == 16
  558. %if cpuflag(ssse3)
  559. movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
  560. pmaddubsw m0, [plane8_shuf] ; H coefficients
  561. %else ; sse2
  562. pxor m2, m2
  563. movd m1, [r0+r1 +4]
  564. punpckldq m0, m1
  565. punpcklbw m0, m2
  566. pmullw m0, [pw_m4to4]
  567. %endif
  568. movhlps m1, m0
  569. %endif
  570. paddw m0, m1
  571. %if notcpuflag(ssse3)
  572. %if cpuflag(mmxext)
  573. PSHUFLW m1, m0, 0xE
  574. %elif cpuflag(mmx)
  575. mova m1, m0
  576. psrlq m1, 32
  577. %endif
  578. paddw m0, m1
  579. %endif ; !ssse3
  580. %if cpuflag(mmxext)
  581. PSHUFLW m1, m0, 0x1
  582. %elif cpuflag(mmx)
  583. mova m1, m0
  584. psrlq m1, 16
  585. %endif
  586. paddw m0, m1 ; sum of H coefficients
  587. lea r4, [r0+r2*4-1]
  588. lea r3, [r0 -1]
  589. add r4, r2
  590. %if ARCH_X86_64
  591. %define e_reg r8
  592. %else
  593. %define e_reg r0
  594. %endif
  595. movzx e_reg, byte [r3+r2*2 ]
  596. movzx r5, byte [r4+r1 ]
  597. sub r5, e_reg
  598. movzx e_reg, byte [r3 ]
  599. %if ARCH_X86_64
  600. movzx r7, byte [r4+r2 ]
  601. sub r7, e_reg
  602. sub r5, r7
  603. %else
  604. movzx r6, byte [r4+r2 ]
  605. sub r6, e_reg
  606. lea r5, [r5+r6*4]
  607. sub r5, r6
  608. %endif
  609. movzx e_reg, byte [r3+r1 ]
  610. movzx r6, byte [r4+r2*2 ]
  611. sub r6, e_reg
  612. %if ARCH_X86_64
  613. add r6, r7
  614. %endif
  615. lea r5, [r5+r6*4]
  616. movzx e_reg, byte [r3+r2 ]
  617. movzx r6, byte [r4 ]
  618. sub r6, e_reg
  619. lea r6, [r5+r6*2]
  620. lea r5, [r6*9+16]
  621. lea r5, [r5+r6*8]
  622. sar r5, 5
  623. %if ARCH_X86_64 == 0
  624. mov r0, r0m
  625. %endif
  626. movzx r3, byte [r4+r2*2 ]
  627. movzx r4, byte [r0+r1 +7]
  628. lea r3, [r3+r4+1]
  629. shl r3, 4
  630. movd r1d, m0
  631. movsx r1d, r1w
  632. imul r1d, 17
  633. add r1d, 16
  634. sar r1d, 5
  635. movd m0, r1d
  636. add r1d, r5d
  637. sub r3d, r1d
  638. add r1d, r1d
  639. sub r3d, r1d ; a
  640. movd m1, r5d
  641. movd m3, r3d
  642. SPLATW m0, m0, 0 ; H
  643. SPLATW m1, m1, 0 ; V
  644. SPLATW m3, m3, 0 ; a
  645. %if mmsize == 8
  646. mova m2, m0
  647. %endif
  648. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  649. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  650. %if mmsize == 8
  651. psllw m2, 2
  652. paddw m2, m0 ; a + {4,5,6,7}*H
  653. %endif
  654. mov r4, 4
  655. ALIGN 16
  656. .loop:
  657. %if mmsize == 16
  658. mova m3, m0 ; b[0..7]
  659. paddw m0, m1
  660. psraw m3, 5
  661. mova m4, m0 ; V+b[0..7]
  662. paddw m0, m1
  663. psraw m4, 5
  664. packuswb m3, m4
  665. movh [r0], m3
  666. movhps [r0+r2], m3
  667. %else ; mmsize == 8
  668. mova m3, m0 ; b[0..3]
  669. mova m4, m2 ; b[4..7]
  670. paddw m0, m1
  671. paddw m2, m1
  672. psraw m3, 5
  673. psraw m4, 5
  674. mova m5, m0 ; V+b[0..3]
  675. mova m6, m2 ; V+b[4..7]
  676. paddw m0, m1
  677. paddw m2, m1
  678. psraw m5, 5
  679. psraw m6, 5
  680. packuswb m3, m4
  681. packuswb m5, m6
  682. mova [r0], m3
  683. mova [r0+r2], m5
  684. %endif
  685. lea r0, [r0+r2*2]
  686. dec r4
  687. jg .loop
  688. REP_RET
  689. %endmacro
  690. INIT_MMX mmx
  691. H264_PRED8x8_PLANE
  692. INIT_MMX mmxext
  693. H264_PRED8x8_PLANE
  694. INIT_XMM sse2
  695. H264_PRED8x8_PLANE
  696. INIT_XMM ssse3
  697. H264_PRED8x8_PLANE
  698. ;-----------------------------------------------------------------------------
  699. ; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
  700. ;-----------------------------------------------------------------------------
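; pred[y][x] = top[x]: one movq of the row above, stored to all 8 rows.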
  701. INIT_MMX mmx
  702. cglobal pred8x8_vertical_8, 2,2
  703. sub r0, r1
  704. movq mm0, [r0]
  705. %rep 3
  706. movq [r0+r1*1], mm0
  707. movq [r0+r1*2], mm0
  708. lea r0, [r0+r1*2]
  709. %endrep
  710. movq [r0+r1*1], mm0
  711. movq [r0+r1*2], mm0
  712. RET
  713. ;-----------------------------------------------------------------------------
  714. ; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
  715. ;-----------------------------------------------------------------------------
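; pred[y][x] = left[y]: SPLATB_LOAD broadcasts the byte to the left of
; each row (pshufb/pb_3 on SSSE3, punpck/shift sequences on MMX).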
  716. %macro PRED8x8_H 0
  717. cglobal pred8x8_horizontal_8, 2,3
  718. mov r2, 4
  719. %if cpuflag(ssse3)
  720. mova m2, [pb_3]
  721. %endif
  722. .loop:
  723. SPLATB_LOAD m0, r0+r1*0-1, m2
  724. SPLATB_LOAD m1, r0+r1*1-1, m2
  725. mova [r0+r1*0], m0
  726. mova [r0+r1*1], m1
  727. lea r0, [r0+r1*2]
  728. dec r2
  729. jg .loop
  730. REP_RET
  731. %endmacro
  732. INIT_MMX mmx
  733. PRED8x8_H
  734. INIT_MMX mmxext
  735. PRED8x8_H
  736. INIT_MMX ssse3
  737. PRED8x8_H
  738. ;-----------------------------------------------------------------------------
  739. ; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
  740. ;-----------------------------------------------------------------------------
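; Top-DC: only the row above is available; each 4-wide half of the block
; gets its own DC, dc = (sum of its 4 top pixels + 2) >> 2, replicated
; over all 8 rows. psadbw sums the halves; psrlw+pavgw give the rounded
; shift.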
  741. INIT_MMX mmxext
  742. cglobal pred8x8_top_dc_8, 2,5
  743. sub r0, r1
  744. movq mm0, [r0]
  745. pxor mm1, mm1
  746. pxor mm2, mm2
  747. lea r2, [r0+r1*2]
  748. punpckhbw mm1, mm0
  749. punpcklbw mm0, mm2
  750. psadbw mm1, mm2 ; s1
  751. lea r3, [r2+r1*2]
  752. psadbw mm0, mm2 ; s0
  753. psrlw mm1, 1
  754. psrlw mm0, 1
  755. pavgw mm1, mm2
  756. lea r4, [r3+r1*2]
  757. pavgw mm0, mm2
  758. pshufw mm1, mm1, 0
  759. pshufw mm0, mm0, 0 ; dc0 (w)
  760. packuswb mm0, mm1 ; dc0,dc1 (b)
  761. movq [r0+r1*1], mm0
  762. movq [r0+r1*2], mm0
  763. lea r0, [r3+r1*2]
  764. movq [r2+r1*1], mm0
  765. movq [r2+r1*2], mm0
  766. movq [r3+r1*1], mm0
  767. movq [r3+r1*2], mm0
  768. movq [r0+r1*1], mm0
  769. movq [r0+r1*2], mm0
  770. RET
  771. ;-----------------------------------------------------------------------------
  772. ; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
  773. ;-----------------------------------------------------------------------------
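; Full chroma DC; with s0,s1 = sums of the two top 4-pixel halves and
; s2,s3 = sums of the upper/lower left halves, the four 4x4 quadrants are
; filled with (illustrative): top-left (s0+s2+4)>>3, top-right (s1+2)>>2,
; bottom-left (s3+2)>>2, bottom-right (s1+s3+4)>>3. The pshufw/paddw/
; psrlw/pavgw sequence below evaluates all four divisions at once.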
  774. INIT_MMX mmxext
  775. cglobal pred8x8_dc_8, 2,5
  776. sub r0, r1
  777. pxor m7, m7
  778. movd m0, [r0+0]
  779. movd m1, [r0+4]
  780. psadbw m0, m7 ; s0
  781. mov r4, r0
  782. psadbw m1, m7 ; s1
  783. movzx r2d, byte [r0+r1*1-1]
  784. movzx r3d, byte [r0+r1*2-1]
  785. lea r0, [r0+r1*2]
  786. add r2d, r3d
  787. movzx r3d, byte [r0+r1*1-1]
  788. add r2d, r3d
  789. movzx r3d, byte [r0+r1*2-1]
  790. add r2d, r3d
  791. lea r0, [r0+r1*2]
  792. movd m2, r2d ; s2
  793. movzx r2d, byte [r0+r1*1-1]
  794. movzx r3d, byte [r0+r1*2-1]
  795. lea r0, [r0+r1*2]
  796. add r2d, r3d
  797. movzx r3d, byte [r0+r1*1-1]
  798. add r2d, r3d
  799. movzx r3d, byte [r0+r1*2-1]
  800. add r2d, r3d
  801. movd m3, r2d ; s3
  802. punpcklwd m0, m1
  803. mov r0, r4
  804. punpcklwd m2, m3
  805. punpckldq m0, m2 ; s0, s1, s2, s3
  806. pshufw m3, m0, 11110110b ; s2, s1, s3, s3
  807. lea r2, [r0+r1*2]
  808. pshufw m0, m0, 01110100b ; s0, s1, s3, s1
  809. paddw m0, m3
  810. lea r3, [r2+r1*2]
  811. psrlw m0, 2
  812. pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
  813. lea r4, [r3+r1*2]
  814. packuswb m0, m0
  815. punpcklbw m0, m0
  816. movq m1, m0
  817. punpcklbw m0, m0
  818. punpckhbw m1, m1
  819. movq [r0+r1*1], m0
  820. movq [r0+r1*2], m0
  821. movq [r2+r1*1], m0
  822. movq [r2+r1*2], m0
  823. movq [r3+r1*1], m1
  824. movq [r3+r1*2], m1
  825. movq [r4+r1*1], m1
  826. movq [r4+r1*2], m1
  827. RET
  828. ;-----------------------------------------------------------------------------
  829. ; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
  830. ;-----------------------------------------------------------------------------
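; RV40 uses a single DC for the whole 8x8 block:
;   dc = (sum(top[0..7]) + sum(left[0..7]) + 8) >> 4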
  831. INIT_MMX mmxext
  832. cglobal pred8x8_dc_rv40_8, 2,7
  833. mov r4, r0
  834. sub r0, r1
  835. pxor mm0, mm0
  836. psadbw mm0, [r0]
  837. dec r0
  838. movzx r5d, byte [r0+r1*1]
  839. movd r6d, mm0
  840. lea r0, [r0+r1*2]
  841. %rep 3
  842. movzx r2d, byte [r0+r1*0]
  843. movzx r3d, byte [r0+r1*1]
  844. add r5d, r2d
  845. add r6d, r3d
  846. lea r0, [r0+r1*2]
  847. %endrep
  848. movzx r2d, byte [r0+r1*0]
  849. add r5d, r6d
  850. lea r2d, [r2+r5+8]
  851. shr r2d, 4
  852. movd mm0, r2d
  853. punpcklbw mm0, mm0
  854. pshufw mm0, mm0, 0
  855. mov r3d, 4
  856. .loop:
  857. movq [r4+r1*0], mm0
  858. movq [r4+r1*1], mm0
  859. lea r4, [r4+r1*2]
  860. dec r3d
  861. jg .loop
  862. REP_RET
  863. ;-----------------------------------------------------------------------------
  864. ; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
  865. ;-----------------------------------------------------------------------------
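; Same TrueMotion rule as the 16x16 version, applied to an 8x8 block:
;   pred[y][x] = clip(top[x] + left[y] - topleft, 0, 255)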
  866. %macro PRED8x8_TM 0
  867. cglobal pred8x8_tm_vp8_8, 2,6
  868. sub r0, r1
  869. pxor mm7, mm7
  870. movq mm0, [r0]
  871. movq mm1, mm0
  872. punpcklbw mm0, mm7
  873. punpckhbw mm1, mm7
  874. movzx r4d, byte [r0-1]
  875. mov r5d, 4
  876. .loop:
  877. movzx r2d, byte [r0+r1*1-1]
  878. movzx r3d, byte [r0+r1*2-1]
  879. sub r2d, r4d
  880. sub r3d, r4d
  881. movd mm2, r2d
  882. movd mm4, r3d
  883. SPLATW mm2, mm2, 0
  884. SPLATW mm4, mm4, 0
  885. movq mm3, mm2
  886. movq mm5, mm4
  887. paddw mm2, mm0
  888. paddw mm3, mm1
  889. paddw mm4, mm0
  890. paddw mm5, mm1
  891. packuswb mm2, mm3
  892. packuswb mm4, mm5
  893. movq [r0+r1*1], mm2
  894. movq [r0+r1*2], mm4
  895. lea r0, [r0+r1*2]
  896. dec r5d
  897. jg .loop
  898. REP_RET
  899. %endmacro
  900. INIT_MMX mmx
  901. PRED8x8_TM
  902. INIT_MMX mmxext
  903. PRED8x8_TM
  904. INIT_XMM sse2
  905. cglobal pred8x8_tm_vp8_8, 2,6,4
  906. sub r0, r1
  907. pxor xmm1, xmm1
  908. movq xmm0, [r0]
  909. punpcklbw xmm0, xmm1
  910. movzx r4d, byte [r0-1]
  911. mov r5d, 4
  912. .loop:
  913. movzx r2d, byte [r0+r1*1-1]
  914. movzx r3d, byte [r0+r1*2-1]
  915. sub r2d, r4d
  916. sub r3d, r4d
  917. movd xmm2, r2d
  918. movd xmm3, r3d
  919. pshuflw xmm2, xmm2, 0
  920. pshuflw xmm3, xmm3, 0
  921. punpcklqdq xmm2, xmm2
  922. punpcklqdq xmm3, xmm3
  923. paddw xmm2, xmm0
  924. paddw xmm3, xmm0
  925. packuswb xmm2, xmm3
  926. movq [r0+r1*1], xmm2
  927. movhps [r0+r1*2], xmm2
  928. lea r0, [r0+r1*2]
  929. dec r5d
  930. jg .loop
  931. REP_RET
  932. INIT_XMM ssse3
  933. cglobal pred8x8_tm_vp8_8, 2,3,6
  934. sub r0, r1
  935. movdqa xmm4, [tm_shuf]
  936. pxor xmm1, xmm1
  937. movq xmm0, [r0]
  938. punpcklbw xmm0, xmm1
  939. movd xmm5, [r0-4]
  940. pshufb xmm5, xmm4
  941. mov r2d, 4
  942. .loop:
  943. movd xmm2, [r0+r1*1-4]
  944. movd xmm3, [r0+r1*2-4]
  945. pshufb xmm2, xmm4
  946. pshufb xmm3, xmm4
  947. psubw xmm2, xmm5
  948. psubw xmm3, xmm5
  949. paddw xmm2, xmm0
  950. paddw xmm3, xmm0
  951. packuswb xmm2, xmm3
  952. movq [r0+r1*1], xmm2
  953. movhps [r0+r1*2], xmm2
  954. lea r0, [r0+r1*2]
  955. dec r2d
  956. jg .loop
  957. REP_RET
  958. ; dest, left, right, src, tmp
  959. ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
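; The macro is branch-free: pavgb(left, right) rounds up, and the rounding
; error is removed by subtracting (left^right)&1 before averaging with src:
;   %1 = pavgb(src, pavgb(left, right) - ((left ^ right) & 1))
; which equals (left + 2*src + right + 2) >> 2 exactly.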
  960. %macro PRED4x4_LOWPASS 5
  961. mova %5, %2
  962. pavgb %2, %3
  963. pxor %3, %5
  964. mova %1, %4
  965. pand %3, [pb_1]
  966. psubusb %2, %3
  967. pavgb %1, %2
  968. %endmacro
  969. ;-----------------------------------------------------------------------------
  970. ; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
  971. ; ptrdiff_t stride)
  972. ;-----------------------------------------------------------------------------
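; 8x8 luma top-DC: the 8 top neighbours are lowpass-filtered first, then
;   dc = (sum(top'[0..7]) + 4) >> 3.
; The .fix_lt_*/.fix_tr_* paths substitute replicated edge samples when
; has_topleft/has_topright is zero.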
  973. %macro PRED8x8L_TOP_DC 0
  974. cglobal pred8x8l_top_dc_8, 4,4
  975. sub r0, r3
  976. pxor mm7, mm7
  977. movq mm0, [r0-8]
  978. movq mm3, [r0]
  979. movq mm1, [r0+8]
  980. movq mm2, mm3
  981. movq mm4, mm3
  982. PALIGNR mm2, mm0, 7, mm0
  983. PALIGNR mm1, mm4, 1, mm4
  984. test r1d, r1d ; top_left
  985. jz .fix_lt_2
  986. test r2d, r2d ; top_right
  987. jz .fix_tr_1
  988. jmp .body
  989. .fix_lt_2:
  990. movq mm5, mm3
  991. pxor mm5, mm2
  992. psllq mm5, 56
  993. psrlq mm5, 56
  994. pxor mm2, mm5
  995. test r2d, r2d ; top_right
  996. jnz .body
  997. .fix_tr_1:
  998. movq mm5, mm3
  999. pxor mm5, mm1
  1000. psrlq mm5, 56
  1001. psllq mm5, 56
  1002. pxor mm1, mm5
  1003. .body:
  1004. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1005. psadbw mm7, mm0
  1006. paddw mm7, [pw_4]
  1007. psrlw mm7, 3
  1008. pshufw mm7, mm7, 0
  1009. packuswb mm7, mm7
  1010. %rep 3
  1011. movq [r0+r3*1], mm7
  1012. movq [r0+r3*2], mm7
  1013. lea r0, [r0+r3*2]
  1014. %endrep
  1015. movq [r0+r3*1], mm7
  1016. movq [r0+r3*2], mm7
  1017. RET
  1018. %endmacro
  1019. INIT_MMX mmxext
  1020. PRED8x8L_TOP_DC
  1021. INIT_MMX ssse3
  1022. PRED8x8L_TOP_DC
  1023. ;-----------------------------------------------------------------------------
  1024. ; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
  1025. ; ptrdiff_t stride)
  1026. ;-----------------------------------------------------------------------------
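; 8x8 luma DC: both edges are lowpass-filtered, then
;   dc = (sum(left'[0..7]) + sum(top'[0..7]) + 8) >> 4.
; The punpckhbw/punpckhwd/punpckhdq prologue transposes the 8 left-edge
; pixels into a single mmx register.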
  1027. %macro PRED8x8L_DC 0
  1028. cglobal pred8x8l_dc_8, 4,5
  1029. sub r0, r3
  1030. lea r4, [r0+r3*2]
  1031. movq mm0, [r0+r3*1-8]
  1032. punpckhbw mm0, [r0+r3*0-8]
  1033. movq mm1, [r4+r3*1-8]
  1034. punpckhbw mm1, [r0+r3*2-8]
  1035. mov r4, r0
  1036. punpckhwd mm1, mm0
  1037. lea r0, [r0+r3*4]
  1038. movq mm2, [r0+r3*1-8]
  1039. punpckhbw mm2, [r0+r3*0-8]
  1040. lea r0, [r0+r3*2]
  1041. movq mm3, [r0+r3*1-8]
  1042. punpckhbw mm3, [r0+r3*0-8]
  1043. punpckhwd mm3, mm2
  1044. punpckhdq mm3, mm1
  1045. lea r0, [r0+r3*2]
  1046. movq mm0, [r0+r3*0-8]
  1047. movq mm1, [r4]
  1048. mov r0, r4
  1049. movq mm4, mm3
  1050. movq mm2, mm3
  1051. PALIGNR mm4, mm0, 7, mm0
  1052. PALIGNR mm1, mm2, 1, mm2
  1053. test r1d, r1d
  1054. jnz .do_left
  1055. .fix_lt_1:
  1056. movq mm5, mm3
  1057. pxor mm5, mm4
  1058. psrlq mm5, 56
  1059. psllq mm5, 48
  1060. pxor mm1, mm5
  1061. jmp .do_left
  1062. .fix_lt_2:
  1063. movq mm5, mm3
  1064. pxor mm5, mm2
  1065. psllq mm5, 56
  1066. psrlq mm5, 56
  1067. pxor mm2, mm5
  1068. test r2d, r2d
  1069. jnz .body
  1070. .fix_tr_1:
  1071. movq mm5, mm3
  1072. pxor mm5, mm1
  1073. psrlq mm5, 56
  1074. psllq mm5, 56
  1075. pxor mm1, mm5
  1076. jmp .body
  1077. .do_left:
  1078. movq mm0, mm4
  1079. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1080. movq mm4, mm0
  1081. movq mm7, mm2
  1082. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1083. psllq mm1, 56
  1084. PALIGNR mm7, mm1, 7, mm3
  1085. movq mm0, [r0-8]
  1086. movq mm3, [r0]
  1087. movq mm1, [r0+8]
  1088. movq mm2, mm3
  1089. movq mm4, mm3
  1090. PALIGNR mm2, mm0, 7, mm0
  1091. PALIGNR mm1, mm4, 1, mm4
  1092. test r1d, r1d
  1093. jz .fix_lt_2
  1094. test r2d, r2d
  1095. jz .fix_tr_1
  1096. .body:
  1097. lea r1, [r0+r3*2]
  1098. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1099. pxor mm0, mm0
  1100. pxor mm1, mm1
  1101. lea r2, [r1+r3*2]
  1102. psadbw mm0, mm7
  1103. psadbw mm1, mm6
  1104. paddw mm0, [pw_8]
  1105. paddw mm0, mm1
  1106. lea r4, [r2+r3*2]
  1107. psrlw mm0, 4
  1108. pshufw mm0, mm0, 0
  1109. packuswb mm0, mm0
  1110. movq [r0+r3*1], mm0
  1111. movq [r0+r3*2], mm0
  1112. movq [r1+r3*1], mm0
  1113. movq [r1+r3*2], mm0
  1114. movq [r2+r3*1], mm0
  1115. movq [r2+r3*2], mm0
  1116. movq [r4+r3*1], mm0
  1117. movq [r4+r3*2], mm0
  1118. RET
  1119. %endmacro
  1120. INIT_MMX mmxext
  1121. PRED8x8L_DC
  1122. INIT_MMX ssse3
  1123. PRED8x8L_DC
  1124. ;-----------------------------------------------------------------------------
  1125. ; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
  1126. ; int has_topright, ptrdiff_t stride)
  1127. ;-----------------------------------------------------------------------------
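; 8x8 luma horizontal: row y is filled with the filtered left sample
; left'[y]; the pshufw chain at the end broadcasts the 8 bytes row by row.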
  1128. %macro PRED8x8L_HORIZONTAL 0
  1129. cglobal pred8x8l_horizontal_8, 4,4
  1130. sub r0, r3
  1131. lea r2, [r0+r3*2]
  1132. movq mm0, [r0+r3*1-8]
  1133. test r1d, r1d
  1134. lea r1, [r0+r3]
  1135. cmovnz r1, r0
  1136. punpckhbw mm0, [r1+r3*0-8]
  1137. movq mm1, [r2+r3*1-8]
  1138. punpckhbw mm1, [r0+r3*2-8]
  1139. mov r2, r0
  1140. punpckhwd mm1, mm0
  1141. lea r0, [r0+r3*4]
  1142. movq mm2, [r0+r3*1-8]
  1143. punpckhbw mm2, [r0+r3*0-8]
  1144. lea r0, [r0+r3*2]
  1145. movq mm3, [r0+r3*1-8]
  1146. punpckhbw mm3, [r0+r3*0-8]
  1147. punpckhwd mm3, mm2
  1148. punpckhdq mm3, mm1
  1149. lea r0, [r0+r3*2]
  1150. movq mm0, [r0+r3*0-8]
  1151. movq mm1, [r1+r3*0-8]
  1152. mov r0, r2
  1153. movq mm4, mm3
  1154. movq mm2, mm3
  1155. PALIGNR mm4, mm0, 7, mm0
  1156. PALIGNR mm1, mm2, 1, mm2
  1157. movq mm0, mm4
  1158. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1159. movq mm4, mm0
  1160. movq mm7, mm2
  1161. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1162. psllq mm1, 56
  1163. PALIGNR mm7, mm1, 7, mm3
  1164. movq mm3, mm7
  1165. lea r1, [r0+r3*2]
  1166. movq mm7, mm3
  1167. punpckhbw mm3, mm3
  1168. punpcklbw mm7, mm7
  1169. pshufw mm0, mm3, 0xff
  1170. pshufw mm1, mm3, 0xaa
  1171. lea r2, [r1+r3*2]
  1172. pshufw mm2, mm3, 0x55
  1173. pshufw mm3, mm3, 0x00
  1174. pshufw mm4, mm7, 0xff
  1175. pshufw mm5, mm7, 0xaa
  1176. pshufw mm6, mm7, 0x55
  1177. pshufw mm7, mm7, 0x00
  1178. movq [r0+r3*1], mm0
  1179. movq [r0+r3*2], mm1
  1180. movq [r1+r3*1], mm2
  1181. movq [r1+r3*2], mm3
  1182. movq [r2+r3*1], mm4
  1183. movq [r2+r3*2], mm5
  1184. lea r0, [r2+r3*2]
  1185. movq [r0+r3*1], mm6
  1186. movq [r0+r3*2], mm7
  1187. RET
  1188. %endmacro
  1189. INIT_MMX mmxext
  1190. PRED8x8L_HORIZONTAL
  1191. INIT_MMX ssse3
  1192. PRED8x8L_HORIZONTAL
  1193. ;-----------------------------------------------------------------------------
  1194. ; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
  1195. ; ptrdiff_t stride)
  1196. ;-----------------------------------------------------------------------------
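; 8x8 luma vertical: every row is a copy of the lowpass-filtered top row,
; top'[x] = (top[x-1] + 2*top[x] + top[x+1] + 2) >> 2, with edge fixups
; for missing top-left/top-right samples.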
  1197. %macro PRED8x8L_VERTICAL 0
  1198. cglobal pred8x8l_vertical_8, 4,4
  1199. sub r0, r3
  1200. movq mm0, [r0-8]
  1201. movq mm3, [r0]
  1202. movq mm1, [r0+8]
  1203. movq mm2, mm3
  1204. movq mm4, mm3
  1205. PALIGNR mm2, mm0, 7, mm0
  1206. PALIGNR mm1, mm4, 1, mm4
  1207. test r1d, r1d ; top_left
  1208. jz .fix_lt_2
  1209. test r2d, r2d ; top_right
  1210. jz .fix_tr_1
  1211. jmp .body
  1212. .fix_lt_2:
  1213. movq mm5, mm3
  1214. pxor mm5, mm2
  1215. psllq mm5, 56
  1216. psrlq mm5, 56
  1217. pxor mm2, mm5
  1218. test r2d, r2d ; top_right
  1219. jnz .body
  1220. .fix_tr_1:
  1221. movq mm5, mm3
  1222. pxor mm5, mm1
  1223. psrlq mm5, 56
  1224. psllq mm5, 56
  1225. pxor mm1, mm5
  1226. .body:
  1227. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1228. %rep 3
  1229. movq [r0+r3*1], mm0
  1230. movq [r0+r3*2], mm0
  1231. lea r0, [r0+r3*2]
  1232. %endrep
  1233. movq [r0+r3*1], mm0
  1234. movq [r0+r3*2], mm0
  1235. RET
  1236. %endmacro
  1237. INIT_MMX mmxext
  1238. PRED8x8L_VERTICAL
  1239. INIT_MMX ssse3
  1240. PRED8x8L_VERTICAL
  1241. ;-----------------------------------------------------------------------------
  1242. ; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
  1243. ; int has_topright, ptrdiff_t stride)
  1244. ;-----------------------------------------------------------------------------
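; Diagonal down-left: from the filtered top(+top-right) samples t'[0..15]
; (the last sample repeated when top-right is missing), each predicted row
; is a sliding 8-byte window of lowpass(t'), advancing one sample per row.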
  1245. INIT_MMX mmxext
  1246. cglobal pred8x8l_down_left_8, 4,5
  1247. sub r0, r3
  1248. movq mm0, [r0-8]
  1249. movq mm3, [r0]
  1250. movq mm1, [r0+8]
  1251. movq mm2, mm3
  1252. movq mm4, mm3
  1253. PALIGNR mm2, mm0, 7, mm0
  1254. PALIGNR mm1, mm4, 1, mm4
  1255. test r1d, r1d
  1256. jz .fix_lt_2
  1257. test r2d, r2d
  1258. jz .fix_tr_1
  1259. jmp .do_top
  1260. .fix_lt_2:
  1261. movq mm5, mm3
  1262. pxor mm5, mm2
  1263. psllq mm5, 56
  1264. psrlq mm5, 56
  1265. pxor mm2, mm5
  1266. test r2d, r2d
  1267. jnz .do_top
  1268. .fix_tr_1:
  1269. movq mm5, mm3
  1270. pxor mm5, mm1
  1271. psrlq mm5, 56
  1272. psllq mm5, 56
  1273. pxor mm1, mm5
  1274. jmp .do_top
  1275. .fix_tr_2:
  1276. punpckhbw mm3, mm3
  1277. pshufw mm1, mm3, 0xFF
  1278. jmp .do_topright
  1279. .do_top:
  1280. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1281. movq mm7, mm4
  1282. test r2d, r2d
  1283. jz .fix_tr_2
  1284. movq mm0, [r0+8]
  1285. movq mm5, mm0
  1286. movq mm2, mm0
  1287. movq mm4, mm0
  1288. psrlq mm5, 56
  1289. PALIGNR mm2, mm3, 7, mm3
  1290. PALIGNR mm5, mm4, 1, mm4
  1291. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1292. .do_topright:
  1293. lea r1, [r0+r3*2]
  1294. movq mm6, mm1
  1295. psrlq mm1, 56
  1296. movq mm4, mm1
  1297. lea r2, [r1+r3*2]
  1298. movq mm2, mm6
  1299. PALIGNR mm2, mm7, 1, mm0
  1300. movq mm3, mm6
  1301. PALIGNR mm3, mm7, 7, mm0
  1302. PALIGNR mm4, mm6, 1, mm0
  1303. movq mm5, mm7
  1304. movq mm1, mm7
  1305. movq mm7, mm6
  1306. lea r4, [r2+r3*2]
  1307. psllq mm1, 8
  1308. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1309. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1310. movq [r4+r3*2], mm1
  1311. movq mm2, mm0
  1312. psllq mm1, 8
  1313. psrlq mm2, 56
  1314. psllq mm0, 8
  1315. por mm1, mm2
  1316. movq [r4+r3*1], mm1
  1317. movq mm2, mm0
  1318. psllq mm1, 8
  1319. psrlq mm2, 56
  1320. psllq mm0, 8
  1321. por mm1, mm2
  1322. movq [r2+r3*2], mm1
  1323. movq mm2, mm0
  1324. psllq mm1, 8
  1325. psrlq mm2, 56
  1326. psllq mm0, 8
  1327. por mm1, mm2
  1328. movq [r2+r3*1], mm1
  1329. movq mm2, mm0
  1330. psllq mm1, 8
  1331. psrlq mm2, 56
  1332. psllq mm0, 8
  1333. por mm1, mm2
  1334. movq [r1+r3*2], mm1
  1335. movq mm2, mm0
  1336. psllq mm1, 8
  1337. psrlq mm2, 56
  1338. psllq mm0, 8
  1339. por mm1, mm2
  1340. movq [r1+r3*1], mm1
  1341. movq mm2, mm0
  1342. psllq mm1, 8
  1343. psrlq mm2, 56
  1344. psllq mm0, 8
  1345. por mm1, mm2
  1346. movq [r0+r3*2], mm1
  1347. psllq mm1, 8
  1348. psrlq mm0, 56
  1349. por mm1, mm0
  1350. movq [r0+r3*1], mm1
  1351. RET
  1352. %macro PRED8x8L_DOWN_LEFT 0
  1353. cglobal pred8x8l_down_left_8, 4,4
  1354. sub r0, r3
  1355. movq mm0, [r0-8]
  1356. movq mm3, [r0]
  1357. movq mm1, [r0+8]
  1358. movq mm2, mm3
  1359. movq mm4, mm3
  1360. PALIGNR mm2, mm0, 7, mm0
  1361. PALIGNR mm1, mm4, 1, mm4
  1362. test r1d, r1d ; top_left
  1363. jz .fix_lt_2
  1364. test r2d, r2d ; top_right
  1365. jz .fix_tr_1
  1366. jmp .do_top
  1367. .fix_lt_2:
  1368. movq mm5, mm3
  1369. pxor mm5, mm2
  1370. psllq mm5, 56
  1371. psrlq mm5, 56
  1372. pxor mm2, mm5
  1373. test r2d, r2d ; top_right
  1374. jnz .do_top
  1375. .fix_tr_1:
  1376. movq mm5, mm3
  1377. pxor mm5, mm1
  1378. psrlq mm5, 56
  1379. psllq mm5, 56
  1380. pxor mm1, mm5
  1381. jmp .do_top
  1382. .fix_tr_2:
  1383. punpckhbw mm3, mm3
  1384. pshufw mm1, mm3, 0xFF
  1385. jmp .do_topright
  1386. .do_top:
  1387. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1388. movq2dq xmm3, mm4
  1389. test r2d, r2d ; top_right
  1390. jz .fix_tr_2
  1391. movq mm0, [r0+8]
  1392. movq mm5, mm0
  1393. movq mm2, mm0
  1394. movq mm4, mm0
  1395. psrlq mm5, 56
  1396. PALIGNR mm2, mm3, 7, mm3
  1397. PALIGNR mm5, mm4, 1, mm4
  1398. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1399. .do_topright:
  1400. movq2dq xmm4, mm1
  1401. psrlq mm1, 56
  1402. movq2dq xmm5, mm1
  1403. lea r1, [r0+r3*2]
  1404. pslldq xmm4, 8
  1405. por xmm3, xmm4
  1406. movdqa xmm2, xmm3
  1407. psrldq xmm2, 1
  1408. pslldq xmm5, 15
  1409. por xmm2, xmm5
  1410. lea r2, [r1+r3*2]
  1411. movdqa xmm1, xmm3
  1412. pslldq xmm1, 1
  1413. INIT_XMM cpuname
  1414. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1415. psrldq xmm0, 1
  1416. movq [r0+r3*1], xmm0
  1417. psrldq xmm0, 1
  1418. movq [r0+r3*2], xmm0
  1419. psrldq xmm0, 1
  1420. lea r0, [r2+r3*2]
  1421. movq [r1+r3*1], xmm0
  1422. psrldq xmm0, 1
  1423. movq [r1+r3*2], xmm0
  1424. psrldq xmm0, 1
  1425. movq [r2+r3*1], xmm0
  1426. psrldq xmm0, 1
  1427. movq [r2+r3*2], xmm0
  1428. psrldq xmm0, 1
  1429. movq [r0+r3*1], xmm0
  1430. psrldq xmm0, 1
  1431. movq [r0+r3*2], xmm0
  1432. RET
  1433. %endmacro
  1434. INIT_MMX sse2
  1435. PRED8x8L_DOWN_LEFT
  1436. INIT_MMX ssse3
  1437. PRED8x8L_DOWN_LEFT
  1438. ;-----------------------------------------------------------------------------
  1439. ; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
  1440. ; int has_topright, ptrdiff_t stride)
  1441. ;-----------------------------------------------------------------------------
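; Diagonal down-right: the filtered left column, top-left corner and top
; row are joined into one diagonal, which is lowpass-filtered again; row y
; is an 8-sample window of that diagonal, moving one sample per row.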
  1442. INIT_MMX mmxext
  1443. cglobal pred8x8l_down_right_8, 4,5
  1444. sub r0, r3
  1445. lea r4, [r0+r3*2]
  1446. movq mm0, [r0+r3*1-8]
  1447. punpckhbw mm0, [r0+r3*0-8]
  1448. movq mm1, [r4+r3*1-8]
  1449. punpckhbw mm1, [r0+r3*2-8]
  1450. mov r4, r0
  1451. punpckhwd mm1, mm0
  1452. lea r0, [r0+r3*4]
  1453. movq mm2, [r0+r3*1-8]
  1454. punpckhbw mm2, [r0+r3*0-8]
  1455. lea r0, [r0+r3*2]
  1456. movq mm3, [r0+r3*1-8]
  1457. punpckhbw mm3, [r0+r3*0-8]
  1458. punpckhwd mm3, mm2
  1459. punpckhdq mm3, mm1
  1460. lea r0, [r0+r3*2]
  1461. movq mm0, [r0+r3*0-8]
  1462. movq mm1, [r4]
  1463. mov r0, r4
  1464. movq mm4, mm3
  1465. movq mm2, mm3
  1466. PALIGNR mm4, mm0, 7, mm0
  1467. PALIGNR mm1, mm2, 1, mm2
  1468. test r1d, r1d ; top_left
  1469. jz .fix_lt_1
  1470. .do_left:
  1471. movq mm0, mm4
  1472. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1473. movq mm4, mm0
  1474. movq mm7, mm2
  1475. movq mm6, mm2
  1476. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1477. psllq mm1, 56
  1478. PALIGNR mm7, mm1, 7, mm3
  1479. movq mm0, [r0-8]
  1480. movq mm3, [r0]
  1481. movq mm1, [r0+8]
  1482. movq mm2, mm3
  1483. movq mm4, mm3
  1484. PALIGNR mm2, mm0, 7, mm0
  1485. PALIGNR mm1, mm4, 1, mm4
  1486. test r1d, r1d ; top_left
  1487. jz .fix_lt_2
  1488. test r2d, r2d ; top_right
  1489. jz .fix_tr_1
  1490. .do_top:
  1491. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1492. movq mm5, mm4
  1493. jmp .body
  1494. .fix_lt_1:
  1495. movq mm5, mm3
  1496. pxor mm5, mm4
  1497. psrlq mm5, 56
  1498. psllq mm5, 48
  1499. pxor mm1, mm5
  1500. jmp .do_left
  1501. .fix_lt_2:
  1502. movq mm5, mm3
  1503. pxor mm5, mm2
  1504. psllq mm5, 56
  1505. psrlq mm5, 56
  1506. pxor mm2, mm5
  1507. test r2d, r2d ; top_right
  1508. jnz .do_top
  1509. .fix_tr_1:
  1510. movq mm5, mm3
  1511. pxor mm5, mm1
  1512. psrlq mm5, 56
  1513. psllq mm5, 56
  1514. pxor mm1, mm5
  1515. jmp .do_top
  1516. .body:
  1517. lea r1, [r0+r3*2]
  1518. movq mm1, mm7
  1519. movq mm7, mm5
  1520. movq mm5, mm6
  1521. movq mm2, mm7
  1522. lea r2, [r1+r3*2]
  1523. PALIGNR mm2, mm6, 1, mm0
  1524. movq mm3, mm7
  1525. PALIGNR mm3, mm6, 7, mm0
  1526. movq mm4, mm7
  1527. lea r4, [r2+r3*2]
  1528. psrlq mm4, 8
  1529. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1530. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1531. movq [r4+r3*2], mm0
  1532. movq mm2, mm1
  1533. psrlq mm0, 8
  1534. psllq mm2, 56
  1535. psrlq mm1, 8
  1536. por mm0, mm2
  1537. movq [r4+r3*1], mm0
  1538. movq mm2, mm1
  1539. psrlq mm0, 8
  1540. psllq mm2, 56
  1541. psrlq mm1, 8
  1542. por mm0, mm2
  1543. movq [r2+r3*2], mm0
  1544. movq mm2, mm1
  1545. psrlq mm0, 8
  1546. psllq mm2, 56
  1547. psrlq mm1, 8
  1548. por mm0, mm2
  1549. movq [r2+r3*1], mm0
  1550. movq mm2, mm1
  1551. psrlq mm0, 8
  1552. psllq mm2, 56
  1553. psrlq mm1, 8
  1554. por mm0, mm2
  1555. movq [r1+r3*2], mm0
  1556. movq mm2, mm1
  1557. psrlq mm0, 8
  1558. psllq mm2, 56
  1559. psrlq mm1, 8
  1560. por mm0, mm2
  1561. movq [r1+r3*1], mm0
  1562. movq mm2, mm1
  1563. psrlq mm0, 8
  1564. psllq mm2, 56
  1565. psrlq mm1, 8
  1566. por mm0, mm2
  1567. movq [r0+r3*2], mm0
  1568. psrlq mm0, 8
  1569. psllq mm1, 56
  1570. por mm0, mm1
  1571. movq [r0+r3*1], mm0
  1572. RET
  1573. %macro PRED8x8L_DOWN_RIGHT 0
  1574. cglobal pred8x8l_down_right_8, 4,5
  1575. sub r0, r3
  1576. lea r4, [r0+r3*2]
  1577. movq mm0, [r0+r3*1-8]
  1578. punpckhbw mm0, [r0+r3*0-8]
  1579. movq mm1, [r4+r3*1-8]
  1580. punpckhbw mm1, [r0+r3*2-8]
  1581. mov r4, r0
  1582. punpckhwd mm1, mm0
  1583. lea r0, [r0+r3*4]
  1584. movq mm2, [r0+r3*1-8]
  1585. punpckhbw mm2, [r0+r3*0-8]
  1586. lea r0, [r0+r3*2]
  1587. movq mm3, [r0+r3*1-8]
  1588. punpckhbw mm3, [r0+r3*0-8]
  1589. punpckhwd mm3, mm2
  1590. punpckhdq mm3, mm1
  1591. lea r0, [r0+r3*2]
  1592. movq mm0, [r0+r3*0-8]
  1593. movq mm1, [r4]
  1594. mov r0, r4
  1595. movq mm4, mm3
  1596. movq mm2, mm3
  1597. PALIGNR mm4, mm0, 7, mm0
  1598. PALIGNR mm1, mm2, 1, mm2
  1599. test r1d, r1d
  1600. jz .fix_lt_1
  1601. jmp .do_left
  1602. .fix_lt_1:
  1603. movq mm5, mm3
  1604. pxor mm5, mm4
  1605. psrlq mm5, 56
  1606. psllq mm5, 48
  1607. pxor mm1, mm5
  1608. jmp .do_left
  1609. .fix_lt_2:
  1610. movq mm5, mm3
  1611. pxor mm5, mm2
  1612. psllq mm5, 56
  1613. psrlq mm5, 56
  1614. pxor mm2, mm5
  1615. test r2d, r2d
  1616. jnz .do_top
  1617. .fix_tr_1:
  1618. movq mm5, mm3
  1619. pxor mm5, mm1
  1620. psrlq mm5, 56
  1621. psllq mm5, 56
  1622. pxor mm1, mm5
  1623. jmp .do_top
  1624. .do_left:
  1625. movq mm0, mm4
  1626. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1627. movq mm4, mm0
  1628. movq mm7, mm2
  1629. movq2dq xmm3, mm2
  1630. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1631. psllq mm1, 56
  1632. PALIGNR mm7, mm1, 7, mm3
  1633. movq2dq xmm1, mm7
  1634. movq mm0, [r0-8]
  1635. movq mm3, [r0]
  1636. movq mm1, [r0+8]
  1637. movq mm2, mm3
  1638. movq mm4, mm3
  1639. PALIGNR mm2, mm0, 7, mm0
  1640. PALIGNR mm1, mm4, 1, mm4
  1641. test r1d, r1d
  1642. jz .fix_lt_2
  1643. test r2d, r2d
  1644. jz .fix_tr_1
  1645. .do_top:
  1646. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1647. movq2dq xmm4, mm4
  1648. lea r1, [r0+r3*2]
  1649. movdqa xmm0, xmm3
  1650. pslldq xmm4, 8
  1651. por xmm3, xmm4
  1652. lea r2, [r1+r3*2]
  1653. pslldq xmm4, 1
  1654. por xmm1, xmm4
  1655. psrldq xmm0, 7
  1656. pslldq xmm0, 15
  1657. psrldq xmm0, 7
  1658. por xmm1, xmm0
  1659. lea r0, [r2+r3*2]
  1660. movdqa xmm2, xmm3
  1661. psrldq xmm2, 1
  1662. INIT_XMM cpuname
  1663. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1664. movdqa xmm1, xmm0
  1665. psrldq xmm1, 1
  1666. movq [r0+r3*2], xmm0
  1667. movq [r0+r3*1], xmm1
  1668. psrldq xmm0, 2
  1669. psrldq xmm1, 2
  1670. movq [r2+r3*2], xmm0
  1671. movq [r2+r3*1], xmm1
  1672. psrldq xmm0, 2
  1673. psrldq xmm1, 2
  1674. movq [r1+r3*2], xmm0
  1675. movq [r1+r3*1], xmm1
  1676. psrldq xmm0, 2
  1677. psrldq xmm1, 2
  1678. movq [r4+r3*2], xmm0
  1679. movq [r4+r3*1], xmm1
  1680. RET
  1681. %endmacro
  1682. INIT_MMX sse2
  1683. PRED8x8L_DOWN_RIGHT
  1684. INIT_MMX ssse3
  1685. PRED8x8L_DOWN_RIGHT
  1686. ;-----------------------------------------------------------------------------
  1687. ; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
  1688. ; int has_topright, ptrdiff_t stride)
  1689. ;-----------------------------------------------------------------------------
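; Vertical-right: even rows are pavgb of adjacent filtered samples, odd
; rows their 3-tap lowpass, with the window sliding one sample per row
; pair along the left/top-left/top diagonal.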
  1690. INIT_MMX mmxext
  1691. cglobal pred8x8l_vertical_right_8, 4,5
  1692. sub r0, r3
  1693. lea r4, [r0+r3*2]
  1694. movq mm0, [r0+r3*1-8]
  1695. punpckhbw mm0, [r0+r3*0-8]
  1696. movq mm1, [r4+r3*1-8]
  1697. punpckhbw mm1, [r0+r3*2-8]
  1698. mov r4, r0
  1699. punpckhwd mm1, mm0
  1700. lea r0, [r0+r3*4]
  1701. movq mm2, [r0+r3*1-8]
  1702. punpckhbw mm2, [r0+r3*0-8]
  1703. lea r0, [r0+r3*2]
  1704. movq mm3, [r0+r3*1-8]
  1705. punpckhbw mm3, [r0+r3*0-8]
  1706. punpckhwd mm3, mm2
  1707. punpckhdq mm3, mm1
  1708. lea r0, [r0+r3*2]
  1709. movq mm0, [r0+r3*0-8]
  1710. movq mm1, [r4]
  1711. mov r0, r4
  1712. movq mm4, mm3
  1713. movq mm2, mm3
  1714. PALIGNR mm4, mm0, 7, mm0
  1715. PALIGNR mm1, mm2, 1, mm2
  1716. test r1d, r1d
  1717. jz .fix_lt_1
  1718. jmp .do_left
  1719. .fix_lt_1:
  1720. movq mm5, mm3
  1721. pxor mm5, mm4
  1722. psrlq mm5, 56
  1723. psllq mm5, 48
  1724. pxor mm1, mm5
  1725. jmp .do_left
  1726. .fix_lt_2:
  1727. movq mm5, mm3
  1728. pxor mm5, mm2
  1729. psllq mm5, 56
  1730. psrlq mm5, 56
  1731. pxor mm2, mm5
  1732. test r2d, r2d
  1733. jnz .do_top
  1734. .fix_tr_1:
  1735. movq mm5, mm3
  1736. pxor mm5, mm1
  1737. psrlq mm5, 56
  1738. psllq mm5, 56
  1739. pxor mm1, mm5
  1740. jmp .do_top
  1741. .do_left:
  1742. movq mm0, mm4
  1743. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1744. movq mm7, mm2
  1745. movq mm0, [r0-8]
  1746. movq mm3, [r0]
  1747. movq mm1, [r0+8]
  1748. movq mm2, mm3
  1749. movq mm4, mm3
  1750. PALIGNR mm2, mm0, 7, mm0
  1751. PALIGNR mm1, mm4, 1, mm4
  1752. test r1d, r1d
  1753. jz .fix_lt_2
  1754. test r2d, r2d
  1755. jz .fix_tr_1
  1756. .do_top:
  1757. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1758. lea r1, [r0+r3*2]
  1759. movq mm2, mm6
  1760. movq mm3, mm6
  1761. PALIGNR mm3, mm7, 7, mm0
  1762. PALIGNR mm6, mm7, 6, mm1
  1763. movq mm4, mm3
  1764. pavgb mm3, mm2
  1765. lea r2, [r1+r3*2]
  1766. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1767. movq [r0+r3*1], mm3
  1768. movq [r0+r3*2], mm0
  1769. movq mm5, mm0
  1770. movq mm6, mm3
  1771. movq mm1, mm7
  1772. movq mm2, mm1
  1773. psllq mm2, 8
  1774. movq mm3, mm1
  1775. psllq mm3, 16
  1776. lea r4, [r2+r3*2]
  1777. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1778. PALIGNR mm6, mm0, 7, mm2
  1779. movq [r1+r3*1], mm6
  1780. psllq mm0, 8
  1781. PALIGNR mm5, mm0, 7, mm1
  1782. movq [r1+r3*2], mm5
  1783. psllq mm0, 8
  1784. PALIGNR mm6, mm0, 7, mm2
  1785. movq [r2+r3*1], mm6
  1786. psllq mm0, 8
  1787. PALIGNR mm5, mm0, 7, mm1
  1788. movq [r2+r3*2], mm5
  1789. psllq mm0, 8
  1790. PALIGNR mm6, mm0, 7, mm2
  1791. movq [r4+r3*1], mm6
  1792. psllq mm0, 8
  1793. PALIGNR mm5, mm0, 7, mm1
  1794. movq [r4+r3*2], mm5
  1795. RET
  1796. %macro PRED8x8L_VERTICAL_RIGHT 0
  1797. cglobal pred8x8l_vertical_right_8, 4,5,7
  1798. ; manually spill XMM registers for Win64 because
  1799. ; the code here is initialized with INIT_MMX
  1800. WIN64_SPILL_XMM 7
  1801. sub r0, r3
  1802. lea r4, [r0+r3*2]
  1803. movq mm0, [r0+r3*1-8]
  1804. punpckhbw mm0, [r0+r3*0-8]
  1805. movq mm1, [r4+r3*1-8]
  1806. punpckhbw mm1, [r0+r3*2-8]
  1807. mov r4, r0
  1808. punpckhwd mm1, mm0
  1809. lea r0, [r0+r3*4]
  1810. movq mm2, [r0+r3*1-8]
  1811. punpckhbw mm2, [r0+r3*0-8]
  1812. lea r0, [r0+r3*2]
  1813. movq mm3, [r0+r3*1-8]
  1814. punpckhbw mm3, [r0+r3*0-8]
  1815. punpckhwd mm3, mm2
  1816. punpckhdq mm3, mm1
  1817. lea r0, [r0+r3*2]
  1818. movq mm0, [r0+r3*0-8]
  1819. movq mm1, [r4]
  1820. mov r0, r4
  1821. movq mm4, mm3
  1822. movq mm2, mm3
  1823. PALIGNR mm4, mm0, 7, mm0
  1824. PALIGNR mm1, mm2, 1, mm2
  1825. test r1d, r1d
  1826. jnz .do_left
  1827. .fix_lt_1:
  1828. movq mm5, mm3
  1829. pxor mm5, mm4
  1830. psrlq mm5, 56
  1831. psllq mm5, 48
  1832. pxor mm1, mm5
  1833. jmp .do_left
  1834. .fix_lt_2:
  1835. movq mm5, mm3
  1836. pxor mm5, mm2
  1837. psllq mm5, 56
  1838. psrlq mm5, 56
  1839. pxor mm2, mm5
  1840. test r2d, r2d
  1841. jnz .do_top
  1842. .fix_tr_1:
  1843. movq mm5, mm3
  1844. pxor mm5, mm1
  1845. psrlq mm5, 56
  1846. psllq mm5, 56
  1847. pxor mm1, mm5
  1848. jmp .do_top
  1849. .do_left:
  1850. movq mm0, mm4
  1851. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1852. movq2dq xmm0, mm2
  1853. movq mm0, [r0-8]
  1854. movq mm3, [r0]
  1855. movq mm1, [r0+8]
  1856. movq mm2, mm3
  1857. movq mm4, mm3
  1858. PALIGNR mm2, mm0, 7, mm0
  1859. PALIGNR mm1, mm4, 1, mm4
  1860. test r1d, r1d
  1861. jz .fix_lt_2
  1862. test r2d, r2d
  1863. jz .fix_tr_1
  1864. .do_top:
  1865. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1866. lea r1, [r0+r3*2]
  1867. movq2dq xmm4, mm6
  1868. pslldq xmm4, 8
  1869. por xmm0, xmm4
  1870. movdqa xmm6, [pw_ff00]
  1871. movdqa xmm1, xmm0
  1872. lea r2, [r1+r3*2]
  1873. movdqa xmm2, xmm0
  1874. movdqa xmm3, xmm0
  1875. pslldq xmm0, 1
  1876. pslldq xmm1, 2
  1877. pavgb xmm2, xmm0
  1878. INIT_XMM cpuname
  1879. PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
  1880. pandn xmm6, xmm4
  1881. movdqa xmm5, xmm4
  1882. psrlw xmm4, 8
  1883. packuswb xmm6, xmm4
  1884. movhlps xmm4, xmm6
  1885. movhps [r0+r3*2], xmm5
  1886. movhps [r0+r3*1], xmm2
  1887. psrldq xmm5, 4
  1888. movss xmm5, xmm6
  1889. psrldq xmm2, 4
  1890. movss xmm2, xmm4
  1891. lea r0, [r2+r3*2]
  1892. psrldq xmm5, 1
  1893. psrldq xmm2, 1
  1894. movq [r0+r3*2], xmm5
  1895. movq [r0+r3*1], xmm2
  1896. psrldq xmm5, 1
  1897. psrldq xmm2, 1
  1898. movq [r2+r3*2], xmm5
  1899. movq [r2+r3*1], xmm2
  1900. psrldq xmm5, 1
  1901. psrldq xmm2, 1
  1902. movq [r1+r3*2], xmm5
  1903. movq [r1+r3*1], xmm2
  1904. RET
  1905. %endmacro
  1906. INIT_MMX sse2
  1907. PRED8x8L_VERTICAL_RIGHT
  1908. INIT_MMX ssse3
  1909. PRED8x8L_VERTICAL_RIGHT
  1910. ;-----------------------------------------------------------------------------
  1911. ; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
  1912. ; int has_topright, ptrdiff_t stride)
  1913. ;-----------------------------------------------------------------------------
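; Vertical-left: even rows are pavgb of neighbouring filtered top samples,
; odd rows their 3-tap lowpass; each row pair advances one sample to the
; right.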
  1914. %macro PRED8x8L_VERTICAL_LEFT 0
  1915. cglobal pred8x8l_vertical_left_8, 4,4
  1916. sub r0, r3
  1917. movq mm0, [r0-8]
  1918. movq mm3, [r0]
  1919. movq mm1, [r0+8]
  1920. movq mm2, mm3
  1921. movq mm4, mm3
  1922. PALIGNR mm2, mm0, 7, mm0
  1923. PALIGNR mm1, mm4, 1, mm4
  1924. test r1d, r1d
  1925. jz .fix_lt_2
  1926. test r2d, r2d
  1927. jz .fix_tr_1
  1928. jmp .do_top
  1929. .fix_lt_2:
  1930. movq mm5, mm3
  1931. pxor mm5, mm2
  1932. psllq mm5, 56
  1933. psrlq mm5, 56
  1934. pxor mm2, mm5
  1935. test r2d, r2d
  1936. jnz .do_top
  1937. .fix_tr_1:
  1938. movq mm5, mm3
  1939. pxor mm5, mm1
  1940. psrlq mm5, 56
  1941. psllq mm5, 56
  1942. pxor mm1, mm5
  1943. jmp .do_top
  1944. .fix_tr_2:
  1945. punpckhbw mm3, mm3
  1946. pshufw mm1, mm3, 0xFF
  1947. jmp .do_topright
  1948. .do_top:
  1949. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1950. movq2dq xmm4, mm4
  1951. test r2d, r2d
  1952. jz .fix_tr_2
  1953. movq mm0, [r0+8]
  1954. movq mm5, mm0
  1955. movq mm2, mm0
  1956. movq mm4, mm0
  1957. psrlq mm5, 56
  1958. PALIGNR mm2, mm3, 7, mm3
  1959. PALIGNR mm5, mm4, 1, mm4
  1960. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1961. .do_topright:
  1962. movq2dq xmm3, mm1
  1963. lea r1, [r0+r3*2]
  1964. pslldq xmm3, 8
  1965. por xmm4, xmm3
  1966. movdqa xmm2, xmm4
  1967. movdqa xmm1, xmm4
  1968. movdqa xmm3, xmm4
  1969. psrldq xmm2, 1
  1970. pslldq xmm1, 1
  1971. pavgb xmm3, xmm2
  1972. lea r2, [r1+r3*2]
  1973. INIT_XMM cpuname
  1974. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
  1975. psrldq xmm0, 1
  1976. movq [r0+r3*1], xmm3
  1977. movq [r0+r3*2], xmm0
  1978. lea r0, [r2+r3*2]
  1979. psrldq xmm3, 1
  1980. psrldq xmm0, 1
  1981. movq [r1+r3*1], xmm3
  1982. movq [r1+r3*2], xmm0
  1983. psrldq xmm3, 1
  1984. psrldq xmm0, 1
  1985. movq [r2+r3*1], xmm3
  1986. movq [r2+r3*2], xmm0
  1987. psrldq xmm3, 1
  1988. psrldq xmm0, 1
  1989. movq [r0+r3*1], xmm3
  1990. movq [r0+r3*2], xmm0
  1991. RET
  1992. %endmacro
  1993. INIT_MMX sse2
  1994. PRED8x8L_VERTICAL_LEFT
  1995. INIT_MMX ssse3
  1996. PRED8x8L_VERTICAL_LEFT
  1997. ;-----------------------------------------------------------------------------
  1998. ; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
  1999. ; int has_topright, ptrdiff_t stride)
  2000. ;-----------------------------------------------------------------------------
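; Horizontal-up: each row interleaves pavgb and 3-tap lowpass values of
; successive filtered left samples, sliding down one pixel per row;
; positions past left'[7] repeat the bottom pixel.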
  2001. %macro PRED8x8L_HORIZONTAL_UP 0
  2002. cglobal pred8x8l_horizontal_up_8, 4,4
  2003. sub r0, r3
  2004. lea r2, [r0+r3*2]
  2005. movq mm0, [r0+r3*1-8]
  2006. test r1d, r1d
  2007. lea r1, [r0+r3]
  2008. cmovnz r1, r0
  2009. punpckhbw mm0, [r1+r3*0-8]
  2010. movq mm1, [r2+r3*1-8]
  2011. punpckhbw mm1, [r0+r3*2-8]
  2012. mov r2, r0
  2013. punpckhwd mm1, mm0
  2014. lea r0, [r0+r3*4]
  2015. movq mm2, [r0+r3*1-8]
  2016. punpckhbw mm2, [r0+r3*0-8]
  2017. lea r0, [r0+r3*2]
  2018. movq mm3, [r0+r3*1-8]
  2019. punpckhbw mm3, [r0+r3*0-8]
  2020. punpckhwd mm3, mm2
  2021. punpckhdq mm3, mm1
  2022. lea r0, [r0+r3*2]
  2023. movq mm0, [r0+r3*0-8]
  2024. movq mm1, [r1+r3*0-8]
  2025. mov r0, r2
  2026. movq mm4, mm3
  2027. movq mm2, mm3
  2028. PALIGNR mm4, mm0, 7, mm0
  2029. PALIGNR mm1, mm2, 1, mm2
  2030. movq mm0, mm4
  2031. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  2032. movq mm4, mm0
  2033. movq mm7, mm2
  2034. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  2035. psllq mm1, 56
  2036. PALIGNR mm7, mm1, 7, mm3
  2037. lea r1, [r0+r3*2]
  2038. pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
  2039. psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
  2040. movq mm2, mm0
  2041. psllw mm0, 8
  2042. psrlw mm2, 8
  2043. por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
  2044. movq mm3, mm2
  2045. movq mm4, mm2
  2046. movq mm5, mm2
  2047. psrlq mm2, 8
  2048. psrlq mm3, 16
  2049. lea r2, [r1+r3*2]
  2050. por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
  2051. punpckhbw mm7, mm7
  2052. por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
  2053. pavgb mm4, mm2
  2054. PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
  2055. movq mm5, mm4
  2056. punpcklbw mm4, mm1 ; p4 p3 p2 p1
  2057. punpckhbw mm5, mm1 ; p8 p7 p6 p5
  2058. movq mm6, mm5
  2059. movq mm7, mm5
  2060. movq mm0, mm5
  2061. PALIGNR mm5, mm4, 2, mm1
  2062. pshufw mm1, mm6, 11111001b
  2063. PALIGNR mm6, mm4, 4, mm2
  2064. pshufw mm2, mm7, 11111110b
  2065. PALIGNR mm7, mm4, 6, mm3
  2066. pshufw mm3, mm0, 11111111b
  2067. movq [r0+r3*1], mm4
  2068. movq [r0+r3*2], mm5
  2069. lea r0, [r2+r3*2]
  2070. movq [r1+r3*1], mm6
  2071. movq [r1+r3*2], mm7
  2072. movq [r2+r3*1], mm0
  2073. movq [r2+r3*2], mm1
  2074. movq [r0+r3*1], mm2
  2075. movq [r0+r3*2], mm3
  2076. RET
  2077. %endmacro
  2078. INIT_MMX mmxext
  2079. PRED8x8L_HORIZONTAL_UP
  2080. INIT_MMX ssse3
  2081. PRED8x8L_HORIZONTAL_UP
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
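; Interpolates diagonally from the left column, the top-left corner and the
; top row; every row equals the row beneath it shifted left by two pixels
; (zHD = 2*y - x), so once the two packed result registers are built, each
; store just advances them by 2 bytes per line.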
INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8, 4,5
    sub         r0, r3
    lea         r4, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq      mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov         r4, r0
    punpckhwd mm1, mm0
    lea         r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea         r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r4]
    mov         r0, r4
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    test      r1d, r1d
    jnz .do_left
.fix_lt_1:
    movq      mm5, mm3
    pxor      mm5, mm4
    psrlq     mm5, 56
    psllq     mm5, 48
    pxor      mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq      mm5, mm3
    pxor      mm5, mm2
    psllq     mm5, 56
    psrlq     mm5, 56
    pxor      mm2, mm5
    test      r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq      mm5, mm3
    pxor      mm5, mm1
    psrlq     mm5, 56
    psllq     mm5, 56
    pxor      mm1, mm5
    jmp .do_top
.do_left:
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq      mm4, mm0
    movq      mm7, mm2
    movq      mm6, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq     mm1, 56
    PALIGNR   mm7, mm1, 7, mm3
    movq      mm0, [r0-8]
    movq      mm3, [r0]
    movq      mm1, [r0+8]
    movq      mm2, mm3
    movq      mm4, mm3
    PALIGNR   mm2, mm0, 7, mm0
    PALIGNR   mm1, mm4, 1, mm4
    test      r1d, r1d
    jz .fix_lt_2
    test      r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq      mm5, mm4
    lea         r1, [r0+r3*2]
    psllq     mm7, 56
    movq      mm2, mm5
    movq      mm3, mm6
    movq      mm4, mm2
    PALIGNR   mm2, mm6, 7, mm5
    PALIGNR   mm6, mm7, 7, mm0
    lea         r2, [r1+r3*2]
    PALIGNR   mm4, mm3, 1, mm7
    movq      mm5, mm3
    pavgb     mm3, mm6
    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
    movq      mm4, mm2
    movq      mm1, mm2
    lea         r4, [r2+r3*2]
    psrlq     mm4, 16
    psrlq     mm1, 8
    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
    movq      mm7, mm3
    punpcklbw mm3, mm0
    punpckhbw mm7, mm0
    movq      mm1, mm7
    movq      mm0, mm7
    movq      mm4, mm7
    movq      [r4+r3*2], mm3
    PALIGNR   mm7, mm3, 2, mm5
    movq      [r4+r3*1], mm7
    PALIGNR   mm1, mm3, 4, mm5
    movq      [r2+r3*2], mm1
    PALIGNR   mm0, mm3, 6, mm3
    movq      [r2+r3*1], mm0
    movq      mm2, mm6
    movq      mm3, mm6
    movq      [r1+r3*2], mm4
    PALIGNR   mm6, mm4, 2, mm5
    movq      [r1+r3*1], mm6
    PALIGNR   mm2, mm4, 4, mm5
    movq      [r0+r3*2], mm2
    PALIGNR   mm3, mm4, 6, mm4
    movq      [r0+r3*1], mm3
    RET
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
    sub         r0, r3
    lea         r4, [r0+r3*2]
    movq      mm0, [r0+r3*1-8]
    punpckhbw mm0, [r0+r3*0-8]
    movq      mm1, [r4+r3*1-8]
    punpckhbw mm1, [r0+r3*2-8]
    mov         r4, r0
    punpckhwd mm1, mm0
    lea         r0, [r0+r3*4]
    movq      mm2, [r0+r3*1-8]
    punpckhbw mm2, [r0+r3*0-8]
    lea         r0, [r0+r3*2]
    movq      mm3, [r0+r3*1-8]
    punpckhbw mm3, [r0+r3*0-8]
    punpckhwd mm3, mm2
    punpckhdq mm3, mm1
    lea         r0, [r0+r3*2]
    movq      mm0, [r0+r3*0-8]
    movq      mm1, [r4]
    mov         r0, r4
    movq      mm4, mm3
    movq      mm2, mm3
    PALIGNR   mm4, mm0, 7, mm0
    PALIGNR   mm1, mm2, 1, mm2
    test      r1d, r1d
    jnz .do_left
.fix_lt_1:
    movq      mm5, mm3
    pxor      mm5, mm4
    psrlq     mm5, 56
    psllq     mm5, 48
    pxor      mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq      mm5, mm3
    pxor      mm5, mm2
    psllq     mm5, 56
    psrlq     mm5, 56
    pxor      mm2, mm5
    test      r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq      mm5, mm3
    pxor      mm5, mm1
    psrlq     mm5, 56
    psllq     mm5, 56
    pxor      mm1, mm5
    jmp .do_top
.fix_tr_2:
    punpckhbw mm3, mm3
    pshufw    mm1, mm3, 0xFF
    jmp .do_topright
.do_left:
    movq      mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq2dq  xmm0, mm2
    pslldq   xmm0, 8
    movq      mm4, mm0
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    movq2dq  xmm2, mm1
    pslldq   xmm2, 15
    psrldq   xmm2, 8
    por      xmm0, xmm2
    movq      mm0, [r0-8]
    movq      mm3, [r0]
    movq      mm1, [r0+8]
    movq      mm2, mm3
    movq      mm4, mm3
    PALIGNR   mm2, mm0, 7, mm0
    PALIGNR   mm1, mm4, 1, mm4
    test      r1d, r1d
    jz .fix_lt_2
    test      r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq  xmm1, mm4
    test      r2d, r2d
    jz .fix_tr_2
    movq      mm0, [r0+8]
    movq      mm5, mm0
    movq      mm2, mm0
    movq      mm4, mm0
    psrlq     mm5, 56
    PALIGNR   mm2, mm3, 7, mm3
    PALIGNR   mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
    movq2dq  xmm5, mm1
    pslldq   xmm5, 8
    por      xmm1, xmm5
INIT_XMM cpuname
    lea         r2, [r4+r3*2]
    movdqa   xmm2, xmm1
    movdqa   xmm3, xmm1
    PALIGNR  xmm1, xmm0, 7, xmm4
    PALIGNR  xmm2, xmm0, 9, xmm5
    lea         r1, [r2+r3*2]
    PALIGNR  xmm3, xmm0, 8, xmm0
    movdqa   xmm4, xmm1
    pavgb    xmm4, xmm3
    lea         r0, [r1+r3*2]
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
    punpcklbw xmm4, xmm0
    movhlps  xmm0, xmm4
    movq      [r0+r3*2], xmm4
    movq      [r2+r3*2], xmm0
    psrldq   xmm4, 2
    psrldq   xmm0, 2
    movq      [r0+r3*1], xmm4
    movq      [r2+r3*1], xmm0
    psrldq   xmm4, 2
    psrldq   xmm0, 2
    movq      [r1+r3*2], xmm4
    movq      [r4+r3*2], xmm0
    psrldq   xmm4, 2
    psrldq   xmm0, 2
    movq      [r1+r3*1], xmm4
    movq      [r4+r3*1], xmm0
    RET
%endmacro
INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
;-------------------------------------------------------------------------------
; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
;                             ptrdiff_t stride)
;-------------------------------------------------------------------------------
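; dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3; psadbw sums the four top bytes
; in one step, and the imul by 0x01010101 broadcasts the result byte to a
; full dword that is stored on all four lines.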
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
    pxor      mm7, mm7
    mov         r4, r0
    sub         r0, r2
    movd      mm0, [r0]
    psadbw    mm0, mm7
    movzx     r1d, byte [r0+r2*1-1]
    movd      r3d, mm0
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*2-1]
    lea         r0, [r0+r2*2]
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*1-1]
    add       r3d, r1d
    movzx     r1d, byte [r0+r2*2-1]
    add       r3d, r1d
    add       r3d, 4
    shr       r3d, 3
    imul      r3d, 0x01010101
    mov       [r4+r2*0], r3d
    mov       [r0+r2*0], r3d
    mov       [r0+r2*1], r3d
    mov       [r0+r2*2], r3d
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                 ptrdiff_t stride)
;-----------------------------------------------------------------------------
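; VP8 TrueMotion: pred(x,y) = clip(top[x] + left[y] - topleft). The unpacked
; top row stays in a register; each iteration splats left[y]-topleft across a
; word vector, adds it, and lets packuswb's unsigned saturation do the clip.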
%macro PRED4x4_TM 0
cglobal pred4x4_tm_vp8_8, 3,6
    sub         r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%if cpuflag(mmxext)
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%else
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm4
    lea         r0, [r0+r2*2]
    dec       r5d
    jg .loop
    REP_RET
%endmacro
INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmxext
PRED4x4_TM
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
    sub         r0, r2
    movq      mm6, [tm_shuf]
    pxor      mm1, mm1
    movd      mm0, [r0]
    punpcklbw mm0, mm1
    movd      mm7, [r0-4]
    pshufb    mm7, mm6
    lea         r1, [r0+r2*2]
    movd      mm2, [r0+r2*1-4]
    movd      mm3, [r0+r2*2-4]
    movd      mm4, [r1+r2*1-4]
    movd      mm5, [r1+r2*2-4]
    pshufb    mm2, mm6
    pshufb    mm3, mm6
    pshufb    mm4, mm6
    pshufb    mm5, mm6
    psubw     mm0, mm7
    paddw     mm2, mm0
    paddw     mm3, mm0
    paddw     mm4, mm0
    paddw     mm5, mm0
    packuswb  mm2, mm2
    packuswb  mm3, mm3
    packuswb  mm4, mm4
    packuswb  mm5, mm5
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm3
    movd      [r1+r2*1], mm4
    movd      [r1+r2*2], mm5
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                       ptrdiff_t stride)
;-----------------------------------------------------------------------------
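; Unlike H.264 vertical prediction, VP8 filters the top row first: every
; output pixel is (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2, with topleft and
; topright supplying t[-1] and t[4]; the same filtered row is stored on all
; four lines.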
INIT_MMX mmxext
cglobal pred4x4_vertical_vp8_8, 3,3
    sub         r0, r2
    movd        m1, [r0-1]
    movd        m0, [r0]
    mova        m2, m0              ; t0 t1 t2 t3
    punpckldq   m0, [r1]            ; t0 t1 t2 t3 t4 t5 t6 t7
    lea         r1, [r0+r2*2]
    psrlq       m0, 8               ; t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd      [r0+r2*1], m3
    movd      [r0+r2*2], m3
    movd      [r1+r2*1], m3
    movd      [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                    ptrdiff_t stride)
;-----------------------------------------------------------------------------
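; Diagonal down-left: pred(x,y) = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2,
; with the last topright sample repeated; the pxor/psrlq/pxor sequence below
; builds the "shifted down one byte, last byte duplicated" operand without an
; extra load or table.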
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
    sub         r0, r2
    movq        m1, [r0]
    punpckldq   m1, [r1]
    movq        m2, m1
    movq        m3, m1
    psllq       m1, 8
    pxor        m2, m1
    psrlq       m2, 8
    pxor        m2, m3
    PRED4x4_LOWPASS m0, m1, m2, m3, m4
    lea         r1, [r0+r2*2]
    psrlq       m0, 8
    movd      [r0+r2*1], m0
    psrlq       m0, 8
    movd      [r0+r2*2], m0
    psrlq       m0, 8
    movd      [r1+r2*1], m0
    psrlq       m0, 8
    movd      [r1+r2*2], m0
    RET
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        ptrdiff_t stride)
;------------------------------------------------------------------------------
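; Rows 0/2 are rounded averages avg(t[x+y/2], t[x+y/2+1]), rows 1/3 the 3-tap
; lowpass over the same window; rows 2 and 3 are rows 0 and 1 advanced by one
; sample, hence the single psrlq by 8 before the last two stores.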
INIT_MMX mmxext
cglobal pred4x4_vertical_left_8, 3,3
    sub         r0, r2
    movq        m1, [r0]
    punpckldq   m1, [r1]
    movq        m3, m1
    movq        m2, m1
    psrlq       m3, 8
    psrlq       m2, 16
    movq        m4, m3
    pavgb       m4, m1
    PRED4x4_LOWPASS m0, m1, m2, m3, m5
    lea         r1, [r0+r2*2]
    movh      [r0+r2*1], m4
    movh      [r0+r2*2], m0
    psrlq       m4, 8
    psrlq       m0, 8
    movh      [r1+r2*1], m4
    movh      [r1+r2*2], m0
    RET
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                        ptrdiff_t stride)
;------------------------------------------------------------------------------
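; Left-column-only mode: interleaves avg(l[y], l[y+1]) with the 3-tap lowpass
; of l[y..y+2]; l3 is splatted (the pshufw 0xFF) so that everything past the
; bottom of the column degenerates to plain l3, as the last line requires.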
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    movd        m0, [r0+r2*1-4]
    punpcklbw   m0, [r0+r2*2-4]
    movd        m1, [r1+r2*1-4]
    punpcklbw   m1, [r1+r2*2-4]
    punpckhwd   m0, m1
    movq        m1, m0
    punpckhbw   m1, m1
    pshufw      m1, m1, 0xFF
    punpckhdq   m0, m1
    movq        m2, m0
    movq        m3, m0
    movq        m7, m0
    psrlq       m2, 16
    psrlq       m3, 8
    pavgb       m7, m3
    PRED4x4_LOWPASS m4, m0, m2, m3, m5
    punpcklbw   m7, m4
    movd      [r0+r2*1], m7
    psrlq       m7, 16
    movd      [r0+r2*2], m7
    psrlq       m7, 16
    movd      [r1+r2*1], m7
    movd      [r1+r2*2], m1
    RET
;------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
;                                          const uint8_t *topright,
;                                          ptrdiff_t stride)
;------------------------------------------------------------------------------
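; Packs the neighbours into one register (see the t2..l3 inline comments) so
; a single pavgb plus one lowpass produce every distinct output value; each
; row equals the one beneath it shifted left by two pixels (zHD = 2*y - x).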
INIT_MMX mmxext
cglobal pred4x4_horizontal_down_8, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    movh        m0, [r0-4]          ; lt ..
    punpckldq   m0, [r0]            ; t3 t2 t1 t0 lt .. .. ..
    psllq       m0, 8               ; t2 t1 t0 lt .. .. .. ..
    movd        m1, [r1+r2*2-4]     ; l3
    punpcklbw   m1, [r1+r2*1-4]     ; l2 l3
    movd        m2, [r0+r2*2-4]     ; l1
    punpcklbw   m2, [r0+r2*1-4]     ; l0 l1
    punpckhwd   m1, m2              ; l0 l1 l2 l3
    punpckhdq   m1, m0              ; t2 t1 t0 lt l0 l1 l2 l3
    movq        m0, m1
    movq        m2, m1
    movq        m5, m1
    psrlq       m0, 16              ; .. .. t2 t1 t0 lt l0 l1
    psrlq       m2, 8               ; .. t2 t1 t0 lt l0 l1 l2
    pavgb       m5, m2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    punpcklbw   m5, m3
    psrlq       m3, 32
    PALIGNR     m3, m5, 6, m4
    movh      [r1+r2*2], m5
    psrlq       m5, 16
    movh      [r1+r2*1], m5
    psrlq       m5, 16
    movh      [r0+r2*2], m5
    movh      [r0+r2*1], m3
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
;                                         const uint8_t *topright,
;                                         ptrdiff_t stride)
;-----------------------------------------------------------------------------
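; zVR = 2*x - y: even diagonals are rounded averages of adjacent top samples
; (with topleft as t[-1]), odd diagonals the 3-tap lowpass; rows 2 and 3
; repeat rows 0 and 1 with one filtered left sample shifted in via PALIGNR.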
INIT_MMX mmxext
cglobal pred4x4_vertical_right_8, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    movh        m0, [r0]                  ; ........t3t2t1t0
    movq        m5, m0
    PALIGNR     m0, [r0-8], 7, m1         ; ......t3t2t1t0lt
    pavgb       m5, m0
    PALIGNR     m0, [r0+r2*1-8], 7, m1    ; ....t3t2t1t0ltl0
    movq        m1, m0
    PALIGNR     m0, [r0+r2*2-8], 7, m2    ; ..t3t2t1t0ltl0l1
    movq        m2, m0
    PALIGNR     m0, [r1+r2*1-8], 7, m3    ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movq        m1, m3
    psrlq       m3, 16
    psllq       m1, 48
    movh      [r0+r2*1], m5
    movh      [r0+r2*2], m3
    PALIGNR     m5, m1, 7, m2
    psllq       m1, 8
    movh      [r1+r2*1], m5
    PALIGNR     m3, m1, 7, m1
    movh      [r1+r2*2], m3
    RET
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
;                                     ptrdiff_t stride)
;-----------------------------------------------------------------------------
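; pred(x,y) depends only on x-y, so one 8-wide lowpass over l3..l0,lt,t0..t3
; yields every output value; successive rows are produced by shifting the
; result right one byte before each store.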
INIT_MMX mmxext
cglobal pred4x4_down_right_8, 3,3
    sub         r0, r2
    lea         r1, [r0+r2*2]
    movq        m1, [r1-8]
    movq        m2, [r0+r2*1-8]
    punpckhbw   m2, [r0-8]
    movh        m3, [r0]
    punpckhwd   m1, m2
    PALIGNR     m3, m1, 5, m1
    movq        m1, m3
    PALIGNR     m3, [r1+r2*1-8], 7, m4
    movq        m2, m3
    PALIGNR     m3, [r1+r2*2-8], 7, m4
    PRED4x4_LOWPASS m0, m3, m1, m2, m4
    movh      [r1+r2*2], m0
    psrlq       m0, 8
    movh      [r1+r2*1], m0
    psrlq       m0, 8
    movh      [r0+r2*2], m0
    psrlq       m0, 8
    movh      [r0+r2*1], m0
    RET