
/libavcodec/x86/h264_intrapred.asm

https://bitbucket.org/freeze/ffmpeg
Assembly | 2782 lines (listing truncated below)

  1. ;******************************************************************************
  2. ;* H.264 intra prediction asm optimizations
  3. ;* Copyright (c) 2010 Jason Garrett-Glaser
  4. ;* Copyright (c) 2010 Holger Lubitz
  5. ;* Copyright (c) 2010 Loren Merritt
  6. ;* Copyright (c) 2010 Ronald S. Bultje
  7. ;*
  8. ;* This file is part of FFmpeg.
  9. ;*
  10. ;* FFmpeg is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* FFmpeg is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with FFmpeg; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86inc.asm"
  25. %include "libavutil/x86/x86util.asm"
  26. SECTION_RODATA
  27. tm_shuf: times 8 db 0x03, 0x80
  28. pw_ff00: times 8 dw 0xff00
  29. plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
  30. db 1, 2, 3, 4, 5, 6, 7, 8
  31. plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
  32. db 1, 2, 3, 4, 0, 0, 0, 0
  33. pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
  34. pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
  35. pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
  36. pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
  37. SECTION .text
  38. cextern pb_1
  39. cextern pb_3
  40. cextern pw_4
  41. cextern pw_5
  42. cextern pw_8
  43. cextern pw_16
  44. cextern pw_17
  45. cextern pw_32
  46. ;-----------------------------------------------------------------------------
  47. ; void pred16x16_vertical(uint8_t *src, int stride)
  48. ;-----------------------------------------------------------------------------
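; Rough C equivalent of the routine below (an illustrative sketch, not code
; from this file; the function name is made up): the 16 pixels directly above
; the block are copied into every one of the 16 rows.
;
;   #include <stdint.h>
;   #include <string.h>
;
;   static void pred16x16_vertical_ref(uint8_t *src, int stride)
;   {
;       const uint8_t *top = src - stride;          // row above the block
;       for (int y = 0; y < 16; y++)
;           memcpy(src + y * stride, top, 16);      // replicate it downwards
;   }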
  49. cglobal pred16x16_vertical_mmx, 2,3
  50. sub r0, r1
  51. mov r2, 8
  52. movq mm0, [r0+0]
  53. movq mm1, [r0+8]
  54. .loop:
  55. movq [r0+r1*1+0], mm0
  56. movq [r0+r1*1+8], mm1
  57. movq [r0+r1*2+0], mm0
  58. movq [r0+r1*2+8], mm1
  59. lea r0, [r0+r1*2]
  60. dec r2
  61. jg .loop
  62. REP_RET
  63. cglobal pred16x16_vertical_sse, 2,3
  64. sub r0, r1
  65. mov r2, 4
  66. movaps xmm0, [r0]
  67. .loop:
  68. movaps [r0+r1*1], xmm0
  69. movaps [r0+r1*2], xmm0
  70. lea r0, [r0+r1*2]
  71. movaps [r0+r1*1], xmm0
  72. movaps [r0+r1*2], xmm0
  73. lea r0, [r0+r1*2]
  74. dec r2
  75. jg .loop
  76. REP_RET
  77. ;-----------------------------------------------------------------------------
  78. ; void pred16x16_horizontal(uint8_t *src, int stride)
  79. ;-----------------------------------------------------------------------------
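; Illustrative C sketch (same headers as the sketch above; not part of this
; file): each row is filled with the reconstructed pixel immediately to its
; left.  The asm broadcasts that byte with punpck/pshufw, or pshufb on SSSE3.
;
;   static void pred16x16_horizontal_ref(uint8_t *src, int stride)
;   {
;       for (int y = 0; y < 16; y++)
;           memset(src + y * stride, src[y * stride - 1], 16);
;   }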
  80. %macro PRED16x16_H 1
  81. cglobal pred16x16_horizontal_%1, 2,3
  82. mov r2, 8
  83. %ifidn %1, ssse3
  84. mova m2, [pb_3]
  85. %endif
  86. .loop:
  87. movd m0, [r0+r1*0-4]
  88. movd m1, [r0+r1*1-4]
  89. %ifidn %1, ssse3
  90. pshufb m0, m2
  91. pshufb m1, m2
  92. %else
  93. punpcklbw m0, m0
  94. punpcklbw m1, m1
  95. %ifidn %1, mmxext
  96. pshufw m0, m0, 0xff
  97. pshufw m1, m1, 0xff
  98. %else
  99. punpckhwd m0, m0
  100. punpckhwd m1, m1
  101. punpckhdq m0, m0
  102. punpckhdq m1, m1
  103. %endif
  104. mova [r0+r1*0+8], m0
  105. mova [r0+r1*1+8], m1
  106. %endif
  107. mova [r0+r1*0], m0
  108. mova [r0+r1*1], m1
  109. lea r0, [r0+r1*2]
  110. dec r2
  111. jg .loop
  112. REP_RET
  113. %endmacro
  114. INIT_MMX
  115. PRED16x16_H mmx
  116. PRED16x16_H mmxext
  117. INIT_XMM
  118. PRED16x16_H ssse3
  119. ;-----------------------------------------------------------------------------
  120. ; void pred16x16_dc(uint8_t *src, int stride)
  121. ;-----------------------------------------------------------------------------
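; Rough C reference (illustrative sketch, same headers as above; it assumes
; both the top row and the left column are available, as this function does):
;
;   static void pred16x16_dc_ref(uint8_t *src, int stride)
;   {
;       int sum = 16;                               // rounding term
;       for (int i = 0; i < 16; i++) {
;           sum += src[i - stride];                 // 16 samples above
;           sum += src[i * stride - 1];             // 16 samples to the left
;       }
;       int dc = sum >> 5;
;       for (int y = 0; y < 16; y++)
;           memset(src + y * stride, dc, 16);
;   }
;
; The asm sums the top row with psadbw and the left column with scalar loads.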
  122. %macro PRED16x16_DC 1
  123. cglobal pred16x16_dc_%1, 2,7
  124. mov r4, r0
  125. sub r0, r1
  126. pxor mm0, mm0
  127. pxor mm1, mm1
  128. psadbw mm0, [r0+0]
  129. psadbw mm1, [r0+8]
  130. dec r0
  131. movzx r5d, byte [r0+r1*1]
  132. paddw mm0, mm1
  133. movd r6d, mm0
  134. lea r0, [r0+r1*2]
  135. %rep 7
  136. movzx r2d, byte [r0+r1*0]
  137. movzx r3d, byte [r0+r1*1]
  138. add r5d, r2d
  139. add r6d, r3d
  140. lea r0, [r0+r1*2]
  141. %endrep
  142. movzx r2d, byte [r0+r1*0]
  143. add r5d, r6d
  144. lea r2d, [r2+r5+16]
  145. shr r2d, 5
  146. %ifidn %1, mmxext
  147. movd m0, r2d
  148. punpcklbw m0, m0
  149. pshufw m0, m0, 0
  150. %elifidn %1, sse2
  151. movd m0, r2d
  152. punpcklbw m0, m0
  153. pshuflw m0, m0, 0
  154. punpcklqdq m0, m0
  155. %elifidn %1, ssse3
  156. pxor m1, m1
  157. movd m0, r2d
  158. pshufb m0, m1
  159. %endif
  160. %if mmsize==8
  161. mov r3d, 8
  162. .loop:
  163. mova [r4+r1*0+0], m0
  164. mova [r4+r1*0+8], m0
  165. mova [r4+r1*1+0], m0
  166. mova [r4+r1*1+8], m0
  167. %else
  168. mov r3d, 4
  169. .loop:
  170. mova [r4+r1*0], m0
  171. mova [r4+r1*1], m0
  172. lea r4, [r4+r1*2]
  173. mova [r4+r1*0], m0
  174. mova [r4+r1*1], m0
  175. %endif
  176. lea r4, [r4+r1*2]
  177. dec r3d
  178. jg .loop
  179. REP_RET
  180. %endmacro
  181. INIT_MMX
  182. PRED16x16_DC mmxext
  183. INIT_XMM
  184. PRED16x16_DC sse2
  185. PRED16x16_DC ssse3
  186. ;-----------------------------------------------------------------------------
  187. ; void pred16x16_tm_vp8(uint8_t *src, int stride)
  188. ;-----------------------------------------------------------------------------
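; VP8 "TrueMotion" prediction.  Illustrative C sketch (not from this file):
;
;   #include <stdint.h>
;
;   static void pred16x16_tm_vp8_ref(uint8_t *src, int stride)
;   {
;       const uint8_t *top = src - stride;
;       int tl = top[-1];                           // top-left neighbour
;       for (int y = 0; y < 16; y++) {
;           int d = src[y * stride - 1] - tl;       // left[y] - topleft
;           for (int x = 0; x < 16; x++) {
;               int v = top[x] + d;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;       }
;   }
;
; The asm keeps the top row widened to 16-bit words, adds the per-row delta,
; and relies on packuswb for the final clamp to 0..255.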
  189. %macro PRED16x16_TM_MMX 1
  190. cglobal pred16x16_tm_vp8_%1, 2,5
  191. sub r0, r1
  192. pxor mm7, mm7
  193. movq mm0, [r0+0]
  194. movq mm2, [r0+8]
  195. movq mm1, mm0
  196. movq mm3, mm2
  197. punpcklbw mm0, mm7
  198. punpckhbw mm1, mm7
  199. punpcklbw mm2, mm7
  200. punpckhbw mm3, mm7
  201. movzx r3d, byte [r0-1]
  202. mov r4d, 16
  203. .loop:
  204. movzx r2d, byte [r0+r1-1]
  205. sub r2d, r3d
  206. movd mm4, r2d
  207. %ifidn %1, mmx
  208. punpcklwd mm4, mm4
  209. punpckldq mm4, mm4
  210. %else
  211. pshufw mm4, mm4, 0
  212. %endif
  213. movq mm5, mm4
  214. movq mm6, mm4
  215. movq mm7, mm4
  216. paddw mm4, mm0
  217. paddw mm5, mm1
  218. paddw mm6, mm2
  219. paddw mm7, mm3
  220. packuswb mm4, mm5
  221. packuswb mm6, mm7
  222. movq [r0+r1+0], mm4
  223. movq [r0+r1+8], mm6
  224. add r0, r1
  225. dec r4d
  226. jg .loop
  227. REP_RET
  228. %endmacro
  229. PRED16x16_TM_MMX mmx
  230. PRED16x16_TM_MMX mmxext
  231. cglobal pred16x16_tm_vp8_sse2, 2,6,6
  232. sub r0, r1
  233. pxor xmm2, xmm2
  234. movdqa xmm0, [r0]
  235. movdqa xmm1, xmm0
  236. punpcklbw xmm0, xmm2
  237. punpckhbw xmm1, xmm2
  238. movzx r4d, byte [r0-1]
  239. mov r5d, 8
  240. .loop:
  241. movzx r2d, byte [r0+r1*1-1]
  242. movzx r3d, byte [r0+r1*2-1]
  243. sub r2d, r4d
  244. sub r3d, r4d
  245. movd xmm2, r2d
  246. movd xmm4, r3d
  247. pshuflw xmm2, xmm2, 0
  248. pshuflw xmm4, xmm4, 0
  249. punpcklqdq xmm2, xmm2
  250. punpcklqdq xmm4, xmm4
  251. movdqa xmm3, xmm2
  252. movdqa xmm5, xmm4
  253. paddw xmm2, xmm0
  254. paddw xmm3, xmm1
  255. paddw xmm4, xmm0
  256. paddw xmm5, xmm1
  257. packuswb xmm2, xmm3
  258. packuswb xmm4, xmm5
  259. movdqa [r0+r1*1], xmm2
  260. movdqa [r0+r1*2], xmm4
  261. lea r0, [r0+r1*2]
  262. dec r5d
  263. jg .loop
  264. REP_RET
  265. ;-----------------------------------------------------------------------------
  266. ; void pred16x16_plane(uint8_t *src, int stride)
  267. ;-----------------------------------------------------------------------------
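; H.264 plane (gradient) prediction.  A hedged C sketch of the h264 variant
; (illustrative only; the rv40 and svq3 variants generated below differ only
; in how H and V are rescaled, and svq3 additionally swaps H and V):
;
;   #include <stdint.h>
;
;   static void pred16x16_plane_h264_ref(uint8_t *src, int stride)
;   {
;       const uint8_t *top = src - stride;
;       int H = 0, V = 0;
;       for (int i = 1; i <= 8; i++) {
;           H += i * (top[7 + i] - top[7 - i]);
;           V += i * (src[(7 + i) * stride - 1] - src[(7 - i) * stride - 1]);
;       }
;       H = (5 * H + 32) >> 6;                      // rv40: (5*H) >> 6
;       V = (5 * V + 32) >> 6;                      // svq3: (5*(V/4)) / 16
;       int a = 16 * (src[15 * stride - 1] + top[15] + 1) - 7 * (H + V);
;       for (int y = 0; y < 16; y++, a += V) {
;           for (int x = 0; x < 16; x++) {
;               int v = (a + x * H) >> 5;
;               src[y * stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;       }
;   }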
  268. %macro H264_PRED16x16_PLANE 3
  269. cglobal pred16x16_plane_%3_%1, 2, 7, %2
  270. mov r2, r1 ; +stride
  271. neg r1 ; -stride
  272. movh m0, [r0+r1 -1]
  273. %if mmsize == 8
  274. pxor m4, m4
  275. movh m1, [r0+r1 +3 ]
  276. movh m2, [r0+r1 +8 ]
  277. movh m3, [r0+r1 +12]
  278. punpcklbw m0, m4
  279. punpcklbw m1, m4
  280. punpcklbw m2, m4
  281. punpcklbw m3, m4
  282. pmullw m0, [pw_m8tom1 ]
  283. pmullw m1, [pw_m8tom1+8]
  284. pmullw m2, [pw_1to8 ]
  285. pmullw m3, [pw_1to8 +8]
  286. paddw m0, m2
  287. paddw m1, m3
  288. %else ; mmsize == 16
  289. %ifidn %1, sse2
  290. pxor m2, m2
  291. movh m1, [r0+r1 +8]
  292. punpcklbw m0, m2
  293. punpcklbw m1, m2
  294. pmullw m0, [pw_m8tom1]
  295. pmullw m1, [pw_1to8]
  296. paddw m0, m1
  297. %else ; ssse3
  298. movhps m0, [r0+r1 +8]
  299. pmaddubsw m0, [plane_shuf] ; H coefficients
  300. %endif
  301. movhlps m1, m0
  302. %endif
  303. paddw m0, m1
  304. %ifidn %1, mmx
  305. mova m1, m0
  306. psrlq m1, 32
  307. %elifidn %1, mmx2
  308. pshufw m1, m0, 0xE
  309. %else ; mmsize == 16
  310. pshuflw m1, m0, 0xE
  311. %endif
  312. paddw m0, m1
  313. %ifidn %1, mmx
  314. mova m1, m0
  315. psrlq m1, 16
  316. %elifidn %1, mmx2
  317. pshufw m1, m0, 0x1
  318. %else
  319. pshuflw m1, m0, 0x1
  320. %endif
  321. paddw m0, m1 ; sum of H coefficients
  322. lea r4, [r0+r2*8-1]
  323. lea r3, [r0+r2*4-1]
  324. add r4, r2
  325. %ifdef ARCH_X86_64
  326. %define e_reg r11
  327. %else
  328. %define e_reg r0
  329. %endif
  330. movzx e_reg, byte [r3+r2*2 ]
  331. movzx r5, byte [r4+r1 ]
  332. sub r5, e_reg
  333. movzx e_reg, byte [r3+r2 ]
  334. movzx r6, byte [r4 ]
  335. sub r6, e_reg
  336. lea r5, [r5+r6*2]
  337. movzx e_reg, byte [r3+r1 ]
  338. movzx r6, byte [r4+r2*2 ]
  339. sub r6, e_reg
  340. lea r5, [r5+r6*4]
  341. movzx e_reg, byte [r3 ]
  342. %ifdef ARCH_X86_64
  343. movzx r10, byte [r4+r2 ]
  344. sub r10, e_reg
  345. %else
  346. movzx r6, byte [r4+r2 ]
  347. sub r6, e_reg
  348. lea r5, [r5+r6*4]
  349. sub r5, r6
  350. %endif
  351. lea e_reg, [r3+r1*4]
  352. lea r3, [r4+r2*4]
  353. movzx r4, byte [e_reg+r2 ]
  354. movzx r6, byte [r3 ]
  355. sub r6, r4
  356. %ifdef ARCH_X86_64
  357. lea r6, [r10+r6*2]
  358. lea r5, [r5+r6*2]
  359. add r5, r6
  360. %else
  361. lea r5, [r5+r6*4]
  362. lea r5, [r5+r6*2]
  363. %endif
  364. movzx r4, byte [e_reg ]
  365. %ifdef ARCH_X86_64
  366. movzx r10, byte [r3 +r2 ]
  367. sub r10, r4
  368. sub r5, r10
  369. %else
  370. movzx r6, byte [r3 +r2 ]
  371. sub r6, r4
  372. lea r5, [r5+r6*8]
  373. sub r5, r6
  374. %endif
  375. movzx r4, byte [e_reg+r1 ]
  376. movzx r6, byte [r3 +r2*2]
  377. sub r6, r4
  378. %ifdef ARCH_X86_64
  379. add r6, r10
  380. %endif
  381. lea r5, [r5+r6*8]
  382. movzx r4, byte [e_reg+r2*2]
  383. movzx r6, byte [r3 +r1 ]
  384. sub r6, r4
  385. lea r5, [r5+r6*4]
  386. add r5, r6 ; sum of V coefficients
  387. %ifndef ARCH_X86_64
  388. mov r0, r0m
  389. %endif
  390. %ifidn %3, h264
  391. lea r5, [r5*5+32]
  392. sar r5, 6
  393. %elifidn %3, rv40
  394. lea r5, [r5*5]
  395. sar r5, 6
  396. %elifidn %3, svq3
  397. test r5, r5
  398. lea r6, [r5+3]
  399. cmovs r5, r6
  400. sar r5, 2 ; V/4
  401. lea r5, [r5*5] ; 5*(V/4)
  402. test r5, r5
  403. lea r6, [r5+15]
  404. cmovs r5, r6
  405. sar r5, 4 ; (5*(V/4))/16
  406. %endif
  407. movzx r4, byte [r0+r1 +15]
  408. movzx r3, byte [r3+r2*2 ]
  409. lea r3, [r3+r4+1]
  410. shl r3, 4
  411. movd r1d, m0
  412. movsx r1d, r1w
  413. %ifnidn %3, svq3
  414. %ifidn %3, h264
  415. lea r1d, [r1d*5+32]
  416. %else ; rv40
  417. lea r1d, [r1d*5]
  418. %endif
  419. sar r1d, 6
  420. %else ; svq3
  421. test r1d, r1d
  422. lea r4d, [r1d+3]
  423. cmovs r1d, r4d
  424. sar r1d, 2 ; H/4
  425. lea r1d, [r1d*5] ; 5*(H/4)
  426. test r1d, r1d
  427. lea r4d, [r1d+15]
  428. cmovs r1d, r4d
  429. sar r1d, 4 ; (5*(H/4))/16
  430. %endif
  431. movd m0, r1d
  432. add r1d, r5d
  433. add r3d, r1d
  434. shl r1d, 3
  435. sub r3d, r1d ; a
  436. movd m1, r5d
  437. movd m3, r3d
  438. %ifidn %1, mmx
  439. punpcklwd m0, m0
  440. punpcklwd m1, m1
  441. punpcklwd m3, m3
  442. punpckldq m0, m0
  443. punpckldq m1, m1
  444. punpckldq m3, m3
  445. %elifidn %1, mmx2
  446. pshufw m0, m0, 0x0
  447. pshufw m1, m1, 0x0
  448. pshufw m3, m3, 0x0
  449. %else
  450. pshuflw m0, m0, 0x0
  451. pshuflw m1, m1, 0x0
  452. pshuflw m3, m3, 0x0
  453. punpcklqdq m0, m0 ; splat H (words)
  454. punpcklqdq m1, m1 ; splat V (words)
  455. punpcklqdq m3, m3 ; splat a (words)
  456. %endif
  457. %ifidn %3, svq3
  458. SWAP 0, 1
  459. %endif
  460. mova m2, m0
  461. %if mmsize == 8
  462. mova m5, m0
  463. %endif
  464. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  465. %if mmsize == 16
  466. psllw m2, 3
  467. %else
  468. psllw m5, 3
  469. psllw m2, 2
  470. mova m6, m5
  471. paddw m6, m2
  472. %endif
  473. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  474. paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
  475. %if mmsize == 8
  476. paddw m5, m0 ; a + {8,9,10,11}*H
  477. paddw m6, m0 ; a + {12,13,14,15}*H
  478. %endif
  479. mov r4, 8
  480. .loop:
  481. mova m3, m0 ; b[0..7]
  482. mova m4, m2 ; b[8..15]
  483. psraw m3, 5
  484. psraw m4, 5
  485. packuswb m3, m4
  486. mova [r0], m3
  487. %if mmsize == 8
  488. mova m3, m5 ; b[8..11]
  489. mova m4, m6 ; b[12..15]
  490. psraw m3, 5
  491. psraw m4, 5
  492. packuswb m3, m4
  493. mova [r0+8], m3
  494. %endif
  495. paddw m0, m1
  496. paddw m2, m1
  497. %if mmsize == 8
  498. paddw m5, m1
  499. paddw m6, m1
  500. %endif
  501. mova m3, m0 ; b[0..7]
  502. mova m4, m2 ; b[8..15]
  503. psraw m3, 5
  504. psraw m4, 5
  505. packuswb m3, m4
  506. mova [r0+r2], m3
  507. %if mmsize == 8
  508. mova m3, m5 ; b[8..11]
  509. mova m4, m6 ; b[12..15]
  510. psraw m3, 5
  511. psraw m4, 5
  512. packuswb m3, m4
  513. mova [r0+r2+8], m3
  514. %endif
  515. paddw m0, m1
  516. paddw m2, m1
  517. %if mmsize == 8
  518. paddw m5, m1
  519. paddw m6, m1
  520. %endif
  521. lea r0, [r0+r2*2]
  522. dec r4
  523. jg .loop
  524. REP_RET
  525. %endmacro
  526. INIT_MMX
  527. H264_PRED16x16_PLANE mmx, 0, h264
  528. H264_PRED16x16_PLANE mmx, 0, rv40
  529. H264_PRED16x16_PLANE mmx, 0, svq3
  530. H264_PRED16x16_PLANE mmx2, 0, h264
  531. H264_PRED16x16_PLANE mmx2, 0, rv40
  532. H264_PRED16x16_PLANE mmx2, 0, svq3
  533. INIT_XMM
  534. H264_PRED16x16_PLANE sse2, 8, h264
  535. H264_PRED16x16_PLANE sse2, 8, rv40
  536. H264_PRED16x16_PLANE sse2, 8, svq3
  537. H264_PRED16x16_PLANE ssse3, 8, h264
  538. H264_PRED16x16_PLANE ssse3, 8, rv40
  539. H264_PRED16x16_PLANE ssse3, 8, svq3
  540. ;-----------------------------------------------------------------------------
  541. ; void pred8x8_plane(uint8_t *src, int stride)
  542. ;-----------------------------------------------------------------------------
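; Chroma 8x8 analogue of the plane prediction above (hedged summary, derived
; from the code rather than quoted from a reference):
;   H  = sum(i = 1..4) i * (top[3+i]  - top[3-i])
;   V  = sum(i = 1..4) i * (left[3+i] - left[3-i])
;   H' = (17*H + 16) >> 5,  V' = (17*V + 16) >> 5
;   a  = 16 * (top[7] + left[7] + 1) - 3 * (H' + V')
;   dst[y][x] = clip((a + x*H' + y*V') >> 5),  x, y = 0..7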
  543. %macro H264_PRED8x8_PLANE 2
  544. cglobal pred8x8_plane_%1, 2, 7, %2
  545. mov r2, r1 ; +stride
  546. neg r1 ; -stride
  547. movd m0, [r0+r1 -1]
  548. %if mmsize == 8
  549. pxor m2, m2
  550. movh m1, [r0+r1 +4 ]
  551. punpcklbw m0, m2
  552. punpcklbw m1, m2
  553. pmullw m0, [pw_m4to4]
  554. pmullw m1, [pw_m4to4+8]
  555. %else ; mmsize == 16
  556. %ifidn %1, sse2
  557. pxor m2, m2
  558. movd m1, [r0+r1 +4]
  559. punpckldq m0, m1
  560. punpcklbw m0, m2
  561. pmullw m0, [pw_m4to4]
  562. %else ; ssse3
  563. movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
  564. pmaddubsw m0, [plane8_shuf] ; H coefficients
  565. %endif
  566. movhlps m1, m0
  567. %endif
  568. paddw m0, m1
  569. %ifnidn %1, ssse3
  570. %ifidn %1, mmx
  571. mova m1, m0
  572. psrlq m1, 32
  573. %elifidn %1, mmx2
  574. pshufw m1, m0, 0xE
  575. %else ; mmsize == 16
  576. pshuflw m1, m0, 0xE
  577. %endif
  578. paddw m0, m1
  579. %endif ; !ssse3
  580. %ifidn %1, mmx
  581. mova m1, m0
  582. psrlq m1, 16
  583. %elifidn %1, mmx2
  584. pshufw m1, m0, 0x1
  585. %else
  586. pshuflw m1, m0, 0x1
  587. %endif
  588. paddw m0, m1 ; sum of H coefficients
  589. lea r4, [r0+r2*4-1]
  590. lea r3, [r0 -1]
  591. add r4, r2
  592. %ifdef ARCH_X86_64
  593. %define e_reg r11
  594. %else
  595. %define e_reg r0
  596. %endif
  597. movzx e_reg, byte [r3+r2*2 ]
  598. movzx r5, byte [r4+r1 ]
  599. sub r5, e_reg
  600. movzx e_reg, byte [r3 ]
  601. %ifdef ARCH_X86_64
  602. movzx r10, byte [r4+r2 ]
  603. sub r10, e_reg
  604. sub r5, r10
  605. %else
  606. movzx r6, byte [r4+r2 ]
  607. sub r6, e_reg
  608. lea r5, [r5+r6*4]
  609. sub r5, r6
  610. %endif
  611. movzx e_reg, byte [r3+r1 ]
  612. movzx r6, byte [r4+r2*2 ]
  613. sub r6, e_reg
  614. %ifdef ARCH_X86_64
  615. add r6, r10
  616. %endif
  617. lea r5, [r5+r6*4]
  618. movzx e_reg, byte [r3+r2 ]
  619. movzx r6, byte [r4 ]
  620. sub r6, e_reg
  621. lea r6, [r5+r6*2]
  622. lea r5, [r6*9+16]
  623. lea r5, [r5+r6*8]
  624. sar r5, 5
  625. %ifndef ARCH_X86_64
  626. mov r0, r0m
  627. %endif
  628. movzx r3, byte [r4+r2*2 ]
  629. movzx r4, byte [r0+r1 +7]
  630. lea r3, [r3+r4+1]
  631. shl r3, 4
  632. movd r1d, m0
  633. movsx r1d, r1w
  634. imul r1d, 17
  635. add r1d, 16
  636. sar r1d, 5
  637. movd m0, r1d
  638. add r1d, r5d
  639. sub r3d, r1d
  640. add r1d, r1d
  641. sub r3d, r1d ; a
  642. movd m1, r5d
  643. movd m3, r3d
  644. %ifidn %1, mmx
  645. punpcklwd m0, m0
  646. punpcklwd m1, m1
  647. punpcklwd m3, m3
  648. punpckldq m0, m0
  649. punpckldq m1, m1
  650. punpckldq m3, m3
  651. %elifidn %1, mmx2
  652. pshufw m0, m0, 0x0
  653. pshufw m1, m1, 0x0
  654. pshufw m3, m3, 0x0
  655. %else
  656. pshuflw m0, m0, 0x0
  657. pshuflw m1, m1, 0x0
  658. pshuflw m3, m3, 0x0
  659. punpcklqdq m0, m0 ; splat H (words)
  660. punpcklqdq m1, m1 ; splat V (words)
  661. punpcklqdq m3, m3 ; splat a (words)
  662. %endif
  663. %if mmsize == 8
  664. mova m2, m0
  665. %endif
  666. pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
  667. paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
  668. %if mmsize == 8
  669. psllw m2, 2
  670. paddw m2, m0 ; a + {4,5,6,7}*H
  671. %endif
  672. mov r4, 4
  673. ALIGN 16
  674. .loop:
  675. %if mmsize == 16
  676. mova m3, m0 ; b[0..7]
  677. paddw m0, m1
  678. psraw m3, 5
  679. mova m4, m0 ; V+b[0..7]
  680. paddw m0, m1
  681. psraw m4, 5
  682. packuswb m3, m4
  683. movh [r0], m3
  684. movhps [r0+r2], m3
  685. %else ; mmsize == 8
  686. mova m3, m0 ; b[0..3]
  687. mova m4, m2 ; b[4..7]
  688. paddw m0, m1
  689. paddw m2, m1
  690. psraw m3, 5
  691. psraw m4, 5
  692. mova m5, m0 ; V+b[0..3]
  693. mova m6, m2 ; V+b[4..7]
  694. paddw m0, m1
  695. paddw m2, m1
  696. psraw m5, 5
  697. psraw m6, 5
  698. packuswb m3, m4
  699. packuswb m5, m6
  700. mova [r0], m3
  701. mova [r0+r2], m5
  702. %endif
  703. lea r0, [r0+r2*2]
  704. dec r4
  705. jg .loop
  706. REP_RET
  707. %endmacro
  708. INIT_MMX
  709. H264_PRED8x8_PLANE mmx, 0
  710. H264_PRED8x8_PLANE mmx2, 0
  711. INIT_XMM
  712. H264_PRED8x8_PLANE sse2, 8
  713. H264_PRED8x8_PLANE ssse3, 8
  714. ;-----------------------------------------------------------------------------
  715. ; void pred8x8_vertical(uint8_t *src, int stride)
  716. ;-----------------------------------------------------------------------------
  717. cglobal pred8x8_vertical_mmx, 2,2
  718. sub r0, r1
  719. movq mm0, [r0]
  720. %rep 3
  721. movq [r0+r1*1], mm0
  722. movq [r0+r1*2], mm0
  723. lea r0, [r0+r1*2]
  724. %endrep
  725. movq [r0+r1*1], mm0
  726. movq [r0+r1*2], mm0
  727. RET
  728. ;-----------------------------------------------------------------------------
  729. ; void pred8x8_horizontal(uint8_t *src, int stride)
  730. ;-----------------------------------------------------------------------------
  731. %macro PRED8x8_H 1
  732. cglobal pred8x8_horizontal_%1, 2,3
  733. mov r2, 4
  734. %ifidn %1, ssse3
  735. mova m2, [pb_3]
  736. %endif
  737. .loop:
  738. movd m0, [r0+r1*0-4]
  739. movd m1, [r0+r1*1-4]
  740. %ifidn %1, ssse3
  741. pshufb m0, m2
  742. pshufb m1, m2
  743. %else
  744. punpcklbw m0, m0
  745. punpcklbw m1, m1
  746. %ifidn %1, mmxext
  747. pshufw m0, m0, 0xff
  748. pshufw m1, m1, 0xff
  749. %else
  750. punpckhwd m0, m0
  751. punpckhwd m1, m1
  752. punpckhdq m0, m0
  753. punpckhdq m1, m1
  754. %endif
  755. %endif
  756. mova [r0+r1*0], m0
  757. mova [r0+r1*1], m1
  758. lea r0, [r0+r1*2]
  759. dec r2
  760. jg .loop
  761. REP_RET
  762. %endmacro
  763. INIT_MMX
  764. PRED8x8_H mmx
  765. PRED8x8_H mmxext
  766. PRED8x8_H ssse3
  767. ;-----------------------------------------------------------------------------
  768. ; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
  769. ;-----------------------------------------------------------------------------
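; Chroma 8x8 "top DC": two DC values are derived from the row above and each
; fills a 4-wide column band for all 8 rows (hedged summary of the code):
;   dc0 = (top[0]+top[1]+top[2]+top[3] + 2) >> 2   -> columns 0..3
;   dc1 = (top[4]+top[5]+top[6]+top[7] + 2) >> 2   -> columns 4..7
; The psrlw-by-1 followed by pavgw against zero implements the (s + 2) >> 2
; rounding without leaving the SIMD registers.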
  770. cglobal pred8x8_top_dc_mmxext, 2,5
  771. sub r0, r1
  772. movq mm0, [r0]
  773. pxor mm1, mm1
  774. pxor mm2, mm2
  775. lea r2, [r0+r1*2]
  776. punpckhbw mm1, mm0
  777. punpcklbw mm0, mm2
  778. psadbw mm1, mm2 ; s1
  779. lea r3, [r2+r1*2]
  780. psadbw mm0, mm2 ; s0
  781. psrlw mm1, 1
  782. psrlw mm0, 1
  783. pavgw mm1, mm2
  784. lea r4, [r3+r1*2]
  785. pavgw mm0, mm2
  786. pshufw mm1, mm1, 0
  787. pshufw mm0, mm0, 0 ; dc0 (w)
  788. packuswb mm0, mm1 ; dc0,dc1 (b)
  789. movq [r0+r1*1], mm0
  790. movq [r0+r1*2], mm0
  791. lea r0, [r3+r1*2]
  792. movq [r2+r1*1], mm0
  793. movq [r2+r1*2], mm0
  794. movq [r3+r1*1], mm0
  795. movq [r3+r1*2], mm0
  796. movq [r0+r1*1], mm0
  797. movq [r0+r1*2], mm0
  798. RET
  799. ;-----------------------------------------------------------------------------
  800. ; void pred8x8_dc_mmxext(uint8_t *src, int stride)
  801. ;-----------------------------------------------------------------------------
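; Chroma 8x8 DC with both edges present: each 4x4 quadrant gets its own DC
; (hedged summary; t = row above, l = column to the left):
;   dc0 = (t0..t3 + l0..l3 + 4) >> 3   top-left
;   dc1 = (t4..t7 + 2) >> 2            top-right
;   dc2 = (l4..l7 + 2) >> 2            bottom-left
;   dc3 = (t4..t7 + l4..l7 + 4) >> 3   bottom-right
; The pshufw/paddw pair below builds the four sums in one register, and the
; psrlw + pavgw sequence applies the rounded shifts.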
  802. INIT_MMX
  803. cglobal pred8x8_dc_mmxext, 2,5
  804. sub r0, r1
  805. pxor m7, m7
  806. movd m0, [r0+0]
  807. movd m1, [r0+4]
  808. psadbw m0, m7 ; s0
  809. mov r4, r0
  810. psadbw m1, m7 ; s1
  811. movzx r2d, byte [r0+r1*1-1]
  812. movzx r3d, byte [r0+r1*2-1]
  813. lea r0, [r0+r1*2]
  814. add r2d, r3d
  815. movzx r3d, byte [r0+r1*1-1]
  816. add r2d, r3d
  817. movzx r3d, byte [r0+r1*2-1]
  818. add r2d, r3d
  819. lea r0, [r0+r1*2]
  820. movd m2, r2d ; s2
  821. movzx r2d, byte [r0+r1*1-1]
  822. movzx r3d, byte [r0+r1*2-1]
  823. lea r0, [r0+r1*2]
  824. add r2d, r3d
  825. movzx r3d, byte [r0+r1*1-1]
  826. add r2d, r3d
  827. movzx r3d, byte [r0+r1*2-1]
  828. add r2d, r3d
  829. movd m3, r2d ; s3
  830. punpcklwd m0, m1
  831. mov r0, r4
  832. punpcklwd m2, m3
  833. punpckldq m0, m2 ; s0, s1, s2, s3
  834. pshufw m3, m0, 11110110b ; s2, s1, s3, s3
  835. lea r2, [r0+r1*2]
  836. pshufw m0, m0, 01110100b ; s0, s1, s3, s1
  837. paddw m0, m3
  838. lea r3, [r2+r1*2]
  839. psrlw m0, 2
  840. pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
  841. lea r4, [r3+r1*2]
  842. packuswb m0, m0
  843. punpcklbw m0, m0
  844. movq m1, m0
  845. punpcklbw m0, m0
  846. punpckhbw m1, m1
  847. movq [r0+r1*1], m0
  848. movq [r0+r1*2], m0
  849. movq [r2+r1*1], m0
  850. movq [r2+r1*2], m0
  851. movq [r3+r1*1], m1
  852. movq [r3+r1*2], m1
  853. movq [r4+r1*1], m1
  854. movq [r4+r1*2], m1
  855. RET
  856. ;-----------------------------------------------------------------------------
  857. ; void pred8x8_dc_rv40(uint8_t *src, int stride)
  858. ;-----------------------------------------------------------------------------
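; RV40 uses a single DC for the whole 8x8 chroma block (hedged summary):
;   dc = (top[0..7] + left[0..7] + 8) >> 4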
  859. cglobal pred8x8_dc_rv40_mmxext, 2,7
  860. mov r4, r0
  861. sub r0, r1
  862. pxor mm0, mm0
  863. psadbw mm0, [r0]
  864. dec r0
  865. movzx r5d, byte [r0+r1*1]
  866. movd r6d, mm0
  867. lea r0, [r0+r1*2]
  868. %rep 3
  869. movzx r2d, byte [r0+r1*0]
  870. movzx r3d, byte [r0+r1*1]
  871. add r5d, r2d
  872. add r6d, r3d
  873. lea r0, [r0+r1*2]
  874. %endrep
  875. movzx r2d, byte [r0+r1*0]
  876. add r5d, r6d
  877. lea r2d, [r2+r5+8]
  878. shr r2d, 4
  879. movd mm0, r2d
  880. punpcklbw mm0, mm0
  881. pshufw mm0, mm0, 0
  882. mov r3d, 4
  883. .loop:
  884. movq [r4+r1*0], mm0
  885. movq [r4+r1*1], mm0
  886. lea r4, [r4+r1*2]
  887. dec r3d
  888. jg .loop
  889. REP_RET
  890. ;-----------------------------------------------------------------------------
  891. ; void pred8x8_tm_vp8(uint8_t *src, int stride)
  892. ;-----------------------------------------------------------------------------
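; Same TrueMotion rule as pred16x16_tm_vp8 above, applied to an 8x8 block:
; dst[y][x] = clip(top[x] + left[y] - topleft).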
  893. %macro PRED8x8_TM_MMX 1
  894. cglobal pred8x8_tm_vp8_%1, 2,6
  895. sub r0, r1
  896. pxor mm7, mm7
  897. movq mm0, [r0]
  898. movq mm1, mm0
  899. punpcklbw mm0, mm7
  900. punpckhbw mm1, mm7
  901. movzx r4d, byte [r0-1]
  902. mov r5d, 4
  903. .loop:
  904. movzx r2d, byte [r0+r1*1-1]
  905. movzx r3d, byte [r0+r1*2-1]
  906. sub r2d, r4d
  907. sub r3d, r4d
  908. movd mm2, r2d
  909. movd mm4, r3d
  910. %ifidn %1, mmx
  911. punpcklwd mm2, mm2
  912. punpcklwd mm4, mm4
  913. punpckldq mm2, mm2
  914. punpckldq mm4, mm4
  915. %else
  916. pshufw mm2, mm2, 0
  917. pshufw mm4, mm4, 0
  918. %endif
  919. movq mm3, mm2
  920. movq mm5, mm4
  921. paddw mm2, mm0
  922. paddw mm3, mm1
  923. paddw mm4, mm0
  924. paddw mm5, mm1
  925. packuswb mm2, mm3
  926. packuswb mm4, mm5
  927. movq [r0+r1*1], mm2
  928. movq [r0+r1*2], mm4
  929. lea r0, [r0+r1*2]
  930. dec r5d
  931. jg .loop
  932. REP_RET
  933. %endmacro
  934. PRED8x8_TM_MMX mmx
  935. PRED8x8_TM_MMX mmxext
  936. cglobal pred8x8_tm_vp8_sse2, 2,6,4
  937. sub r0, r1
  938. pxor xmm1, xmm1
  939. movq xmm0, [r0]
  940. punpcklbw xmm0, xmm1
  941. movzx r4d, byte [r0-1]
  942. mov r5d, 4
  943. .loop:
  944. movzx r2d, byte [r0+r1*1-1]
  945. movzx r3d, byte [r0+r1*2-1]
  946. sub r2d, r4d
  947. sub r3d, r4d
  948. movd xmm2, r2d
  949. movd xmm3, r3d
  950. pshuflw xmm2, xmm2, 0
  951. pshuflw xmm3, xmm3, 0
  952. punpcklqdq xmm2, xmm2
  953. punpcklqdq xmm3, xmm3
  954. paddw xmm2, xmm0
  955. paddw xmm3, xmm0
  956. packuswb xmm2, xmm3
  957. movq [r0+r1*1], xmm2
  958. movhps [r0+r1*2], xmm2
  959. lea r0, [r0+r1*2]
  960. dec r5d
  961. jg .loop
  962. REP_RET
  963. cglobal pred8x8_tm_vp8_ssse3, 2,3,6
  964. sub r0, r1
  965. movdqa xmm4, [tm_shuf]
  966. pxor xmm1, xmm1
  967. movq xmm0, [r0]
  968. punpcklbw xmm0, xmm1
  969. movd xmm5, [r0-4]
  970. pshufb xmm5, xmm4
  971. mov r2d, 4
  972. .loop:
  973. movd xmm2, [r0+r1*1-4]
  974. movd xmm3, [r0+r1*2-4]
  975. pshufb xmm2, xmm4
  976. pshufb xmm3, xmm4
  977. psubw xmm2, xmm5
  978. psubw xmm3, xmm5
  979. paddw xmm2, xmm0
  980. paddw xmm3, xmm0
  981. packuswb xmm2, xmm3
  982. movq [r0+r1*1], xmm2
  983. movhps [r0+r1*2], xmm2
  984. lea r0, [r0+r1*2]
  985. dec r2d
  986. jg .loop
  987. REP_RET
  988. ; dest, left, right, src, tmp
  989. ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
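; The macro evaluates that filter entirely on packed bytes.  A sketch of the
; identity it relies on (my own derivation for illustration, not a comment
; carried over from the original file):
;
;   (a + 2*b + c + 2) >> 2  ==  (b + ((a + c) >> 1) + 1) >> 1
;
; pavgb computes (x + y + 1) >> 1, i.e. it rounds up, so avg(a,c) is first
; corrected down by one whenever a + c is odd:
;
;   #include <stdint.h>
;
;   static uint8_t lowpass_ref(uint8_t a, uint8_t b, uint8_t c)
;   {
;       uint8_t ac = (uint8_t)((a + c + 1) >> 1) - ((a ^ c) & 1); // floor((a+c)/2)
;       return (uint8_t)((b + ac + 1) >> 1);                      // (a+2b+c+2)>>2
;   }
;
; In the macro, %2/%3/%4 are a/c/b and [pb_1] supplies the (a ^ c) & 1 mask.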
  990. %macro PRED4x4_LOWPASS 5
  991. mova %5, %2
  992. pavgb %2, %3
  993. pxor %3, %5
  994. mova %1, %4
  995. pand %3, [pb_1]
  996. psubusb %2, %3
  997. pavgb %1, %2
  998. %endmacro
  999. ;-----------------------------------------------------------------------------
  1000. ; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
  1001. ;-----------------------------------------------------------------------------
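; The pred8x8l_* functions serve 8x8-transform luma blocks: the neighbouring
; edge is smoothed with PRED4x4_LOWPASS before prediction.  Hedged sketch of
; the filtered top edge shared by these routines:
;   t[x]   = src[-stride + x]                       for x = 0..7
;   t[-1]  = has_topleft  ? src[-stride - 1] : t[0]
;   t[8]   = has_topright ? src[-stride + 8] : t[7]
;   top[x] = (t[x-1] + 2*t[x] + t[x+1] + 2) >> 2,   x = 0..7
; and the DC here is simply  dc = (top[0] + ... + top[7] + 4) >> 3.
; The .fix_lt_*/.fix_tr_* branches patch the shifted copies of the top row so
; that unavailable neighbours are replaced by the nearest available sample.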
  1002. %macro PRED8x8L_TOP_DC 1
  1003. cglobal pred8x8l_top_dc_%1, 4,4
  1004. sub r0, r3
  1005. pxor mm7, mm7
  1006. movq mm0, [r0-8]
  1007. movq mm3, [r0]
  1008. movq mm1, [r0+8]
  1009. movq mm2, mm3
  1010. movq mm4, mm3
  1011. PALIGNR mm2, mm0, 7, mm0
  1012. PALIGNR mm1, mm4, 1, mm4
  1013. test r1, r1 ; top_left
  1014. jz .fix_lt_2
  1015. test r2, r2 ; top_right
  1016. jz .fix_tr_1
  1017. jmp .body
  1018. .fix_lt_2:
  1019. movq mm5, mm3
  1020. pxor mm5, mm2
  1021. psllq mm5, 56
  1022. psrlq mm5, 56
  1023. pxor mm2, mm5
  1024. test r2, r2 ; top_right
  1025. jnz .body
  1026. .fix_tr_1:
  1027. movq mm5, mm3
  1028. pxor mm5, mm1
  1029. psrlq mm5, 56
  1030. psllq mm5, 56
  1031. pxor mm1, mm5
  1032. .body:
  1033. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1034. psadbw mm7, mm0
  1035. paddw mm7, [pw_4]
  1036. psrlw mm7, 3
  1037. pshufw mm7, mm7, 0
  1038. packuswb mm7, mm7
  1039. %rep 3
  1040. movq [r0+r3*1], mm7
  1041. movq [r0+r3*2], mm7
  1042. lea r0, [r0+r3*2]
  1043. %endrep
  1044. movq [r0+r3*1], mm7
  1045. movq [r0+r3*2], mm7
  1046. RET
  1047. %endmacro
  1048. INIT_MMX
  1049. %define PALIGNR PALIGNR_MMX
  1050. PRED8x8L_TOP_DC mmxext
  1051. %define PALIGNR PALIGNR_SSSE3
  1052. PRED8x8L_TOP_DC ssse3
  1053. ;-----------------------------------------------------------------------------
  1054. ;void pred8x8l_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
  1055. ;-----------------------------------------------------------------------------
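; Full 8x8 luma DC: both the left column and the top row are lowpass-filtered
; (with the same unavailable-neighbour substitution as above), then
;   dc = (left[0..7] + top[0..7] + 8) >> 4
; fills the whole block.  (Hedged summary of the code below.)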
  1056. %macro PRED8x8L_DC 1
  1057. cglobal pred8x8l_dc_%1, 4,5
  1058. sub r0, r3
  1059. lea r4, [r0+r3*2]
  1060. movq mm0, [r0+r3*1-8]
  1061. punpckhbw mm0, [r0+r3*0-8]
  1062. movq mm1, [r4+r3*1-8]
  1063. punpckhbw mm1, [r0+r3*2-8]
  1064. mov r4, r0
  1065. punpckhwd mm1, mm0
  1066. lea r0, [r0+r3*4]
  1067. movq mm2, [r0+r3*1-8]
  1068. punpckhbw mm2, [r0+r3*0-8]
  1069. lea r0, [r0+r3*2]
  1070. movq mm3, [r0+r3*1-8]
  1071. punpckhbw mm3, [r0+r3*0-8]
  1072. punpckhwd mm3, mm2
  1073. punpckhdq mm3, mm1
  1074. lea r0, [r0+r3*2]
  1075. movq mm0, [r0+r3*0-8]
  1076. movq mm1, [r4]
  1077. mov r0, r4
  1078. movq mm4, mm3
  1079. movq mm2, mm3
  1080. PALIGNR mm4, mm0, 7, mm0
  1081. PALIGNR mm1, mm2, 1, mm2
  1082. test r1, r1
  1083. jnz .do_left
  1084. .fix_lt_1:
  1085. movq mm5, mm3
  1086. pxor mm5, mm4
  1087. psrlq mm5, 56
  1088. psllq mm5, 48
  1089. pxor mm1, mm5
  1090. jmp .do_left
  1091. .fix_lt_2:
  1092. movq mm5, mm3
  1093. pxor mm5, mm2
  1094. psllq mm5, 56
  1095. psrlq mm5, 56
  1096. pxor mm2, mm5
  1097. test r2, r2
  1098. jnz .body
  1099. .fix_tr_1:
  1100. movq mm5, mm3
  1101. pxor mm5, mm1
  1102. psrlq mm5, 56
  1103. psllq mm5, 56
  1104. pxor mm1, mm5
  1105. jmp .body
  1106. .do_left:
  1107. movq mm0, mm4
  1108. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1109. movq mm4, mm0
  1110. movq mm7, mm2
  1111. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1112. psllq mm1, 56
  1113. PALIGNR mm7, mm1, 7, mm3
  1114. movq mm0, [r0-8]
  1115. movq mm3, [r0]
  1116. movq mm1, [r0+8]
  1117. movq mm2, mm3
  1118. movq mm4, mm3
  1119. PALIGNR mm2, mm0, 7, mm0
  1120. PALIGNR mm1, mm4, 1, mm4
  1121. test r1, r1
  1122. jz .fix_lt_2
  1123. test r2, r2
  1124. jz .fix_tr_1
  1125. .body:
  1126. lea r1, [r0+r3*2]
  1127. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1128. pxor mm0, mm0
  1129. pxor mm1, mm1
  1130. lea r2, [r1+r3*2]
  1131. psadbw mm0, mm7
  1132. psadbw mm1, mm6
  1133. paddw mm0, [pw_8]
  1134. paddw mm0, mm1
  1135. lea r4, [r2+r3*2]
  1136. psrlw mm0, 4
  1137. pshufw mm0, mm0, 0
  1138. packuswb mm0, mm0
  1139. movq [r0+r3*1], mm0
  1140. movq [r0+r3*2], mm0
  1141. movq [r1+r3*1], mm0
  1142. movq [r1+r3*2], mm0
  1143. movq [r2+r3*1], mm0
  1144. movq [r2+r3*2], mm0
  1145. movq [r4+r3*1], mm0
  1146. movq [r4+r3*2], mm0
  1147. RET
  1148. %endmacro
  1149. INIT_MMX
  1150. %define PALIGNR PALIGNR_MMX
  1151. PRED8x8L_DC mmxext
  1152. %define PALIGNR PALIGNR_SSSE3
  1153. PRED8x8L_DC ssse3
  1154. ;-----------------------------------------------------------------------------
  1155. ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
  1156. ;-----------------------------------------------------------------------------
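; Hedged summary: the left column is lowpass-filtered as described above and
; row y of the block is then filled with the filtered sample left[y]; the
; pshufw ladder below broadcasts each byte of that packed column to a row.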
  1157. %macro PRED8x8L_HORIZONTAL 1
  1158. cglobal pred8x8l_horizontal_%1, 4,4
  1159. sub r0, r3
  1160. lea r2, [r0+r3*2]
  1161. movq mm0, [r0+r3*1-8]
  1162. test r1, r1
  1163. lea r1, [r0+r3]
  1164. cmovnz r1, r0
  1165. punpckhbw mm0, [r1+r3*0-8]
  1166. movq mm1, [r2+r3*1-8]
  1167. punpckhbw mm1, [r0+r3*2-8]
  1168. mov r2, r0
  1169. punpckhwd mm1, mm0
  1170. lea r0, [r0+r3*4]
  1171. movq mm2, [r0+r3*1-8]
  1172. punpckhbw mm2, [r0+r3*0-8]
  1173. lea r0, [r0+r3*2]
  1174. movq mm3, [r0+r3*1-8]
  1175. punpckhbw mm3, [r0+r3*0-8]
  1176. punpckhwd mm3, mm2
  1177. punpckhdq mm3, mm1
  1178. lea r0, [r0+r3*2]
  1179. movq mm0, [r0+r3*0-8]
  1180. movq mm1, [r1+r3*0-8]
  1181. mov r0, r2
  1182. movq mm4, mm3
  1183. movq mm2, mm3
  1184. PALIGNR mm4, mm0, 7, mm0
  1185. PALIGNR mm1, mm2, 1, mm2
  1186. movq mm0, mm4
  1187. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1188. movq mm4, mm0
  1189. movq mm7, mm2
  1190. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1191. psllq mm1, 56
  1192. PALIGNR mm7, mm1, 7, mm3
  1193. movq mm3, mm7
  1194. lea r1, [r0+r3*2]
  1195. movq mm7, mm3
  1196. punpckhbw mm3, mm3
  1197. punpcklbw mm7, mm7
  1198. pshufw mm0, mm3, 0xff
  1199. pshufw mm1, mm3, 0xaa
  1200. lea r2, [r1+r3*2]
  1201. pshufw mm2, mm3, 0x55
  1202. pshufw mm3, mm3, 0x00
  1203. pshufw mm4, mm7, 0xff
  1204. pshufw mm5, mm7, 0xaa
  1205. pshufw mm6, mm7, 0x55
  1206. pshufw mm7, mm7, 0x00
  1207. movq [r0+r3*1], mm0
  1208. movq [r0+r3*2], mm1
  1209. movq [r1+r3*1], mm2
  1210. movq [r1+r3*2], mm3
  1211. movq [r2+r3*1], mm4
  1212. movq [r2+r3*2], mm5
  1213. lea r0, [r2+r3*2]
  1214. movq [r0+r3*1], mm6
  1215. movq [r0+r3*2], mm7
  1216. RET
  1217. %endmacro
  1218. INIT_MMX
  1219. %define PALIGNR PALIGNR_MMX
  1220. PRED8x8L_HORIZONTAL mmxext
  1221. %define PALIGNR PALIGNR_SSSE3
  1222. PRED8x8L_HORIZONTAL ssse3
  1223. ;-----------------------------------------------------------------------------
  1224. ; void pred8x8l_vertical(uint8_t *src, int has_topleft, int has_topright, int stride)
  1225. ;-----------------------------------------------------------------------------
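; Hedged summary: the top row is lowpass-filtered exactly as in
; pred8x8l_top_dc above and the resulting 8 bytes are written to all 8 rows.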
  1226. %macro PRED8x8L_VERTICAL 1
  1227. cglobal pred8x8l_vertical_%1, 4,4
  1228. sub r0, r3
  1229. movq mm0, [r0-8]
  1230. movq mm3, [r0]
  1231. movq mm1, [r0+8]
  1232. movq mm2, mm3
  1233. movq mm4, mm3
  1234. PALIGNR mm2, mm0, 7, mm0
  1235. PALIGNR mm1, mm4, 1, mm4
  1236. test r1, r1 ; top_left
  1237. jz .fix_lt_2
  1238. test r2, r2 ; top_right
  1239. jz .fix_tr_1
  1240. jmp .body
  1241. .fix_lt_2:
  1242. movq mm5, mm3
  1243. pxor mm5, mm2
  1244. psllq mm5, 56
  1245. psrlq mm5, 56
  1246. pxor mm2, mm5
  1247. test r2, r2 ; top_right
  1248. jnz .body
  1249. .fix_tr_1:
  1250. movq mm5, mm3
  1251. pxor mm5, mm1
  1252. psrlq mm5, 56
  1253. psllq mm5, 56
  1254. pxor mm1, mm5
  1255. .body:
  1256. PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
  1257. %rep 3
  1258. movq [r0+r3*1], mm0
  1259. movq [r0+r3*2], mm0
  1260. lea r0, [r0+r3*2]
  1261. %endrep
  1262. movq [r0+r3*1], mm0
  1263. movq [r0+r3*2], mm0
  1264. RET
  1265. %endmacro
  1266. INIT_MMX
  1267. %define PALIGNR PALIGNR_MMX
  1268. PRED8x8L_VERTICAL mmxext
  1269. %define PALIGNR PALIGNR_SSSE3
  1270. PRED8x8L_VERTICAL ssse3
  1271. ;-----------------------------------------------------------------------------
  1272. ;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
  1273. ;-----------------------------------------------------------------------------
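; Diagonal down-left.  Hedged summary: let e[0..15] be the lowpass-filtered
; top and top-right samples (top-right falls back to replicating the last raw
; top sample when unavailable, see .fix_tr_2).  Then
;   dst[y][x] = (e[x+y] + 2*e[x+y+1] + e[x+y+2] + 2) >> 2
; with the bottom-right pixel using (e[14] + 3*e[15] + 2) >> 2.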
  1274. INIT_MMX
  1275. %define PALIGNR PALIGNR_MMX
  1276. cglobal pred8x8l_down_left_mmxext, 4,5
  1277. sub r0, r3
  1278. movq mm0, [r0-8]
  1279. movq mm3, [r0]
  1280. movq mm1, [r0+8]
  1281. movq mm2, mm3
  1282. movq mm4, mm3
  1283. PALIGNR mm2, mm0, 7, mm0
  1284. PALIGNR mm1, mm4, 1, mm4
  1285. test r1, r1
  1286. jz .fix_lt_2
  1287. test r2, r2
  1288. jz .fix_tr_1
  1289. jmp .do_top
  1290. .fix_lt_2:
  1291. movq mm5, mm3
  1292. pxor mm5, mm2
  1293. psllq mm5, 56
  1294. psrlq mm5, 56
  1295. pxor mm2, mm5
  1296. test r2, r2
  1297. jnz .do_top
  1298. .fix_tr_1:
  1299. movq mm5, mm3
  1300. pxor mm5, mm1
  1301. psrlq mm5, 56
  1302. psllq mm5, 56
  1303. pxor mm1, mm5
  1304. jmp .do_top
  1305. .fix_tr_2:
  1306. punpckhbw mm3, mm3
  1307. pshufw mm1, mm3, 0xFF
  1308. jmp .do_topright
  1309. .do_top:
  1310. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1311. movq mm7, mm4
  1312. test r2, r2
  1313. jz .fix_tr_2
  1314. movq mm0, [r0+8]
  1315. movq mm5, mm0
  1316. movq mm2, mm0
  1317. movq mm4, mm0
  1318. psrlq mm5, 56
  1319. PALIGNR mm2, mm3, 7, mm3
  1320. PALIGNR mm5, mm4, 1, mm4
  1321. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1322. .do_topright:
  1323. lea r1, [r0+r3*2]
  1324. movq mm6, mm1
  1325. psrlq mm1, 56
  1326. movq mm4, mm1
  1327. lea r2, [r1+r3*2]
  1328. movq mm2, mm6
  1329. PALIGNR mm2, mm7, 1, mm0
  1330. movq mm3, mm6
  1331. PALIGNR mm3, mm7, 7, mm0
  1332. PALIGNR mm4, mm6, 1, mm0
  1333. movq mm5, mm7
  1334. movq mm1, mm7
  1335. movq mm7, mm6
  1336. lea r4, [r2+r3*2]
  1337. psllq mm1, 8
  1338. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1339. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1340. movq [r4+r3*2], mm1
  1341. movq mm2, mm0
  1342. psllq mm1, 8
  1343. psrlq mm2, 56
  1344. psllq mm0, 8
  1345. por mm1, mm2
  1346. movq [r4+r3*1], mm1
  1347. movq mm2, mm0
  1348. psllq mm1, 8
  1349. psrlq mm2, 56
  1350. psllq mm0, 8
  1351. por mm1, mm2
  1352. movq [r2+r3*2], mm1
  1353. movq mm2, mm0
  1354. psllq mm1, 8
  1355. psrlq mm2, 56
  1356. psllq mm0, 8
  1357. por mm1, mm2
  1358. movq [r2+r3*1], mm1
  1359. movq mm2, mm0
  1360. psllq mm1, 8
  1361. psrlq mm2, 56
  1362. psllq mm0, 8
  1363. por mm1, mm2
  1364. movq [r1+r3*2], mm1
  1365. movq mm2, mm0
  1366. psllq mm1, 8
  1367. psrlq mm2, 56
  1368. psllq mm0, 8
  1369. por mm1, mm2
  1370. movq [r1+r3*1], mm1
  1371. movq mm2, mm0
  1372. psllq mm1, 8
  1373. psrlq mm2, 56
  1374. psllq mm0, 8
  1375. por mm1, mm2
  1376. movq [r0+r3*2], mm1
  1377. psllq mm1, 8
  1378. psrlq mm0, 56
  1379. por mm1, mm0
  1380. movq [r0+r3*1], mm1
  1381. RET
  1382. %macro PRED8x8L_DOWN_LEFT 1
  1383. cglobal pred8x8l_down_left_%1, 4,4
  1384. sub r0, r3
  1385. movq mm0, [r0-8]
  1386. movq mm3, [r0]
  1387. movq mm1, [r0+8]
  1388. movq mm2, mm3
  1389. movq mm4, mm3
  1390. PALIGNR mm2, mm0, 7, mm0
  1391. PALIGNR mm1, mm4, 1, mm4
  1392. test r1, r1 ; top_left
  1393. jz .fix_lt_2
  1394. test r2, r2 ; top_right
  1395. jz .fix_tr_1
  1396. jmp .do_top
  1397. .fix_lt_2:
  1398. movq mm5, mm3
  1399. pxor mm5, mm2
  1400. psllq mm5, 56
  1401. psrlq mm5, 56
  1402. pxor mm2, mm5
  1403. test r2, r2 ; top_right
  1404. jnz .do_top
  1405. .fix_tr_1:
  1406. movq mm5, mm3
  1407. pxor mm5, mm1
  1408. psrlq mm5, 56
  1409. psllq mm5, 56
  1410. pxor mm1, mm5
  1411. jmp .do_top
  1412. .fix_tr_2:
  1413. punpckhbw mm3, mm3
  1414. pshufw mm1, mm3, 0xFF
  1415. jmp .do_topright
  1416. .do_top:
  1417. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1418. movq2dq xmm3, mm4
  1419. test r2, r2 ; top_right
  1420. jz .fix_tr_2
  1421. movq mm0, [r0+8]
  1422. movq mm5, mm0
  1423. movq mm2, mm0
  1424. movq mm4, mm0
  1425. psrlq mm5, 56
  1426. PALIGNR mm2, mm3, 7, mm3
  1427. PALIGNR mm5, mm4, 1, mm4
  1428. PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
  1429. .do_topright:
  1430. movq2dq xmm4, mm1
  1431. psrlq mm1, 56
  1432. movq2dq xmm5, mm1
  1433. lea r1, [r0+r3*2]
  1434. pslldq xmm4, 8
  1435. por xmm3, xmm4
  1436. movdqa xmm2, xmm3
  1437. psrldq xmm2, 1
  1438. pslldq xmm5, 15
  1439. por xmm2, xmm5
  1440. lea r2, [r1+r3*2]
  1441. movdqa xmm1, xmm3
  1442. pslldq xmm1, 1
  1443. INIT_XMM
  1444. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1445. psrldq xmm0, 1
  1446. movq [r0+r3*1], xmm0
  1447. psrldq xmm0, 1
  1448. movq [r0+r3*2], xmm0
  1449. psrldq xmm0, 1
  1450. lea r0, [r2+r3*2]
  1451. movq [r1+r3*1], xmm0
  1452. psrldq xmm0, 1
  1453. movq [r1+r3*2], xmm0
  1454. psrldq xmm0, 1
  1455. movq [r2+r3*1], xmm0
  1456. psrldq xmm0, 1
  1457. movq [r2+r3*2], xmm0
  1458. psrldq xmm0, 1
  1459. movq [r0+r3*1], xmm0
  1460. psrldq xmm0, 1
  1461. movq [r0+r3*2], xmm0
  1462. RET
  1463. %endmacro
  1464. INIT_MMX
  1465. %define PALIGNR PALIGNR_MMX
  1466. PRED8x8L_DOWN_LEFT sse2
  1467. INIT_MMX
  1468. %define PALIGNR PALIGNR_SSSE3
  1469. PRED8x8L_DOWN_LEFT ssse3
  1470. ;-----------------------------------------------------------------------------
  1471. ;void pred8x8l_down_right_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
  1472. ;-----------------------------------------------------------------------------
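; Diagonal down-right.  Hedged summary: the filtered left column, top-left and
; top row form one edge array; every diagonal running from top-right to
; bottom-left takes a single 3-tap lowpass value of three consecutive edge
; samples (the main diagonal uses (l0 + 2*topleft + t0 + 2) >> 2), so each
; output row is the previous one shifted by one byte, which is what the
; psrlq/psllq/por chain below implements.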
  1473. INIT_MMX
  1474. %define PALIGNR PALIGNR_MMX
  1475. cglobal pred8x8l_down_right_mmxext, 4,5
  1476. sub r0, r3
  1477. lea r4, [r0+r3*2]
  1478. movq mm0, [r0+r3*1-8]
  1479. punpckhbw mm0, [r0+r3*0-8]
  1480. movq mm1, [r4+r3*1-8]
  1481. punpckhbw mm1, [r0+r3*2-8]
  1482. mov r4, r0
  1483. punpckhwd mm1, mm0
  1484. lea r0, [r0+r3*4]
  1485. movq mm2, [r0+r3*1-8]
  1486. punpckhbw mm2, [r0+r3*0-8]
  1487. lea r0, [r0+r3*2]
  1488. movq mm3, [r0+r3*1-8]
  1489. punpckhbw mm3, [r0+r3*0-8]
  1490. punpckhwd mm3, mm2
  1491. punpckhdq mm3, mm1
  1492. lea r0, [r0+r3*2]
  1493. movq mm0, [r0+r3*0-8]
  1494. movq mm1, [r4]
  1495. mov r0, r4
  1496. movq mm4, mm3
  1497. movq mm2, mm3
  1498. PALIGNR mm4, mm0, 7, mm0
  1499. PALIGNR mm1, mm2, 1, mm2
  1500. test r1, r1 ; top_left
  1501. jz .fix_lt_1
  1502. .do_left:
  1503. movq mm0, mm4
  1504. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1505. movq mm4, mm0
  1506. movq mm7, mm2
  1507. movq mm6, mm2
  1508. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1509. psllq mm1, 56
  1510. PALIGNR mm7, mm1, 7, mm3
  1511. movq mm0, [r0-8]
  1512. movq mm3, [r0]
  1513. movq mm1, [r0+8]
  1514. movq mm2, mm3
  1515. movq mm4, mm3
  1516. PALIGNR mm2, mm0, 7, mm0
  1517. PALIGNR mm1, mm4, 1, mm4
  1518. test r1, r1 ; top_left
  1519. jz .fix_lt_2
  1520. test r2, r2 ; top_right
  1521. jz .fix_tr_1
  1522. .do_top:
  1523. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1524. movq mm5, mm4
  1525. jmp .body
  1526. .fix_lt_1:
  1527. movq mm5, mm3
  1528. pxor mm5, mm4
  1529. psrlq mm5, 56
  1530. psllq mm5, 48
  1531. pxor mm1, mm5
  1532. jmp .do_left
  1533. .fix_lt_2:
  1534. movq mm5, mm3
  1535. pxor mm5, mm2
  1536. psllq mm5, 56
  1537. psrlq mm5, 56
  1538. pxor mm2, mm5
  1539. test r2, r2 ; top_right
  1540. jnz .do_top
  1541. .fix_tr_1:
  1542. movq mm5, mm3
  1543. pxor mm5, mm1
  1544. psrlq mm5, 56
  1545. psllq mm5, 56
  1546. pxor mm1, mm5
  1547. jmp .do_top
  1548. .body:
  1549. lea r1, [r0+r3*2]
  1550. movq mm1, mm7
  1551. movq mm7, mm5
  1552. movq mm5, mm6
  1553. movq mm2, mm7
  1554. lea r2, [r1+r3*2]
  1555. PALIGNR mm2, mm6, 1, mm0
  1556. movq mm3, mm7
  1557. PALIGNR mm3, mm6, 7, mm0
  1558. movq mm4, mm7
  1559. lea r4, [r2+r3*2]
  1560. psrlq mm4, 8
  1561. PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
  1562. PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
  1563. movq [r4+r3*2], mm0
  1564. movq mm2, mm1
  1565. psrlq mm0, 8
  1566. psllq mm2, 56
  1567. psrlq mm1, 8
  1568. por mm0, mm2
  1569. movq [r4+r3*1], mm0
  1570. movq mm2, mm1
  1571. psrlq mm0, 8
  1572. psllq mm2, 56
  1573. psrlq mm1, 8
  1574. por mm0, mm2
  1575. movq [r2+r3*2], mm0
  1576. movq mm2, mm1
  1577. psrlq mm0, 8
  1578. psllq mm2, 56
  1579. psrlq mm1, 8
  1580. por mm0, mm2
  1581. movq [r2+r3*1], mm0
  1582. movq mm2, mm1
  1583. psrlq mm0, 8
  1584. psllq mm2, 56
  1585. psrlq mm1, 8
  1586. por mm0, mm2
  1587. movq [r1+r3*2], mm0
  1588. movq mm2, mm1
  1589. psrlq mm0, 8
  1590. psllq mm2, 56
  1591. psrlq mm1, 8
  1592. por mm0, mm2
  1593. movq [r1+r3*1], mm0
  1594. movq mm2, mm1
  1595. psrlq mm0, 8
  1596. psllq mm2, 56
  1597. psrlq mm1, 8
  1598. por mm0, mm2
  1599. movq [r0+r3*2], mm0
  1600. psrlq mm0, 8
  1601. psllq mm1, 56
  1602. por mm0, mm1
  1603. movq [r0+r3*1], mm0
  1604. RET
  1605. %macro PRED8x8L_DOWN_RIGHT 1
  1606. cglobal pred8x8l_down_right_%1, 4,5
  1607. sub r0, r3
  1608. lea r4, [r0+r3*2]
  1609. movq mm0, [r0+r3*1-8]
  1610. punpckhbw mm0, [r0+r3*0-8]
  1611. movq mm1, [r4+r3*1-8]
  1612. punpckhbw mm1, [r0+r3*2-8]
  1613. mov r4, r0
  1614. punpckhwd mm1, mm0
  1615. lea r0, [r0+r3*4]
  1616. movq mm2, [r0+r3*1-8]
  1617. punpckhbw mm2, [r0+r3*0-8]
  1618. lea r0, [r0+r3*2]
  1619. movq mm3, [r0+r3*1-8]
  1620. punpckhbw mm3, [r0+r3*0-8]
  1621. punpckhwd mm3, mm2
  1622. punpckhdq mm3, mm1
  1623. lea r0, [r0+r3*2]
  1624. movq mm0, [r0+r3*0-8]
  1625. movq mm1, [r4]
  1626. mov r0, r4
  1627. movq mm4, mm3
  1628. movq mm2, mm3
  1629. PALIGNR mm4, mm0, 7, mm0
  1630. PALIGNR mm1, mm2, 1, mm2
  1631. test r1, r1
  1632. jz .fix_lt_1
  1633. jmp .do_left
  1634. .fix_lt_1:
  1635. movq mm5, mm3
  1636. pxor mm5, mm4
  1637. psrlq mm5, 56
  1638. psllq mm5, 48
  1639. pxor mm1, mm5
  1640. jmp .do_left
  1641. .fix_lt_2:
  1642. movq mm5, mm3
  1643. pxor mm5, mm2
  1644. psllq mm5, 56
  1645. psrlq mm5, 56
  1646. pxor mm2, mm5
  1647. test r2, r2
  1648. jnz .do_top
  1649. .fix_tr_1:
  1650. movq mm5, mm3
  1651. pxor mm5, mm1
  1652. psrlq mm5, 56
  1653. psllq mm5, 56
  1654. pxor mm1, mm5
  1655. jmp .do_top
  1656. .do_left:
  1657. movq mm0, mm4
  1658. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1659. movq mm4, mm0
  1660. movq mm7, mm2
  1661. movq2dq xmm3, mm2
  1662. PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
  1663. psllq mm1, 56
  1664. PALIGNR mm7, mm1, 7, mm3
  1665. movq2dq xmm1, mm7
  1666. movq mm0, [r0-8]
  1667. movq mm3, [r0]
  1668. movq mm1, [r0+8]
  1669. movq mm2, mm3
  1670. movq mm4, mm3
  1671. PALIGNR mm2, mm0, 7, mm0
  1672. PALIGNR mm1, mm4, 1, mm4
  1673. test r1, r1
  1674. jz .fix_lt_2
  1675. test r2, r2
  1676. jz .fix_tr_1
  1677. .do_top:
  1678. PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
  1679. movq2dq xmm4, mm4
  1680. lea r1, [r0+r3*2]
  1681. movdqa xmm0, xmm3
  1682. pslldq xmm4, 8
  1683. por xmm3, xmm4
  1684. lea r2, [r1+r3*2]
  1685. pslldq xmm4, 1
  1686. por xmm1, xmm4
  1687. psrldq xmm0, 7
  1688. pslldq xmm0, 15
  1689. psrldq xmm0, 7
  1690. por xmm1, xmm0
  1691. lea r0, [r2+r3*2]
  1692. movdqa xmm2, xmm3
  1693. psrldq xmm2, 1
  1694. INIT_XMM
  1695. PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
  1696. movdqa xmm1, xmm0
  1697. psrldq xmm1, 1
  1698. movq [r0+r3*2], xmm0
  1699. movq [r0+r3*1], xmm1
  1700. psrldq xmm0, 2
  1701. psrldq xmm1, 2
  1702. movq [r2+r3*2], xmm0
  1703. movq [r2+r3*1], xmm1
  1704. psrldq xmm0, 2
  1705. psrldq xmm1, 2
  1706. movq [r1+r3*2], xmm0
  1707. movq [r1+r3*1], xmm1
  1708. psrldq xmm0, 2
  1709. psrldq xmm1, 2
  1710. movq [r4+r3*2], xmm0
  1711. movq [r4+r3*1], xmm1
  1712. RET
  1713. %endmacro
  1714. INIT_MMX
  1715. %define PALIGNR PALIGNR_MMX
  1716. PRED8x8L_DOWN_RIGHT sse2
  1717. INIT_MMX
  1718. %define PALIGNR PALIGNR_SSSE3
  1719. PRED8x8L_DOWN_RIGHT ssse3
  1720. ;-----------------------------------------------------------------------------
  1721. ; void pred8x8l_vertical_right(uint8_t *src, int has_topleft, int has_topright, int stride)
  1722. ;-----------------------------------------------------------------------------
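; Vertical-right.  Hedged, high-level summary: even rows hold 2-tap averages
; (pavgb) of adjacent filtered top/top-left samples and odd rows hold 3-tap
; lowpass values; each pair of rows is shifted one sample to the right with
; respect to the pair above, with the vacated left positions filled from the
; filtered left column.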
  1723. INIT_MMX
  1724. %define PALIGNR PALIGNR_MMX
  1725. cglobal pred8x8l_vertical_right_mmxext, 4,5
  1726. sub r0, r3
  1727. lea r4, [r0+r3*2]
  1728. movq mm0, [r0+r3*1-8]
  1729. punpckhbw mm0, [r0+r3*0-8]
  1730. movq mm1, [r4+r3*1-8]
  1731. punpckhbw mm1, [r0+r3*2-8]
  1732. mov r4, r0
  1733. punpckhwd mm1, mm0
  1734. lea r0, [r0+r3*4]
  1735. movq mm2, [r0+r3*1-8]
  1736. punpckhbw mm2, [r0+r3*0-8]
  1737. lea r0, [r0+r3*2]
  1738. movq mm3, [r0+r3*1-8]
  1739. punpckhbw mm3, [r0+r3*0-8]
  1740. punpckhwd mm3, mm2
  1741. punpckhdq mm3, mm1
  1742. lea r0, [r0+r3*2]
  1743. movq mm0, [r0+r3*0-8]
  1744. movq mm1, [r4]
  1745. mov r0, r4
  1746. movq mm4, mm3
  1747. movq mm2, mm3
  1748. PALIGNR mm4, mm0, 7, mm0
  1749. PALIGNR mm1, mm2, 1, mm2
  1750. test r1, r1
  1751. jz .fix_lt_1
  1752. jmp .do_left
  1753. .fix_lt_1:
  1754. movq mm5, mm3
  1755. pxor mm5, mm4
  1756. psrlq mm5, 56
  1757. psllq mm5, 48
  1758. pxor mm1, mm5
  1759. jmp .do_left
  1760. .fix_lt_2:
  1761. movq mm5, mm3
  1762. pxor mm5, mm2
  1763. psllq mm5, 56
  1764. psrlq mm5, 56
  1765. pxor mm2, mm5
  1766. test r2, r2
  1767. jnz .do_top
  1768. .fix_tr_1:
  1769. movq mm5, mm3
  1770. pxor mm5, mm1
  1771. psrlq mm5, 56
  1772. psllq mm5, 56
  1773. pxor mm1, mm5
  1774. jmp .do_top
  1775. .do_left:
  1776. movq mm0, mm4
  1777. PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
  1778. movq mm7, mm2
  1779. movq mm0, [r0-8]
  1780. movq mm3, [r0]
  1781. movq mm1, [r0+8]
  1782. movq mm2, mm3
  1783. movq mm4, mm3
  1784. PALIGNR mm2, mm0, 7, mm0
  1785. PALIGNR mm1, mm4, 1, mm4
  1786. test r1, r1
  1787. jz .fix_lt_2
  1788. test r2, r2
  1789. jz .fix_tr_1
  1790. .do_top:
  1791. PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
  1792. lea r1, [r0+r3*2]
  1793. movq mm2, mm6
  1794. movq mm3, mm6
  1795. PALIGNR mm3, mm7, 7, mm0
  1796. PALIGNR mm6, mm7, 6, mm1
  1797. movq mm4, mm3
  1798. pavgb mm3, mm2
  1799. lea r2, [r1+r3*2]
  1800. PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
  1801. movq [r0+r3*1], mm3
  1802. movq [r0+r3*2], mm0
  1803. movq mm5, mm0
  1804. movq mm6, mm3
  1805. movq mm1, mm7
  1806. movq mm2, mm1
  1807. psllq mm2, 8
  1808. movq mm3, mm1
  1809. psllq mm3, 16
  1810. lea r4, [r2+r3*2]
  1811. PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
  1812. PALIGNR mm6, mm0, 7, mm2
  1813. movq [r1+r3*1], mm6
  1814. psllq mm0, 8
  1815. PALIGNR mm5, mm0, 7, mm1
  1816. movq [r1+r3*2], mm5
  1817. psllq mm0, 8
  1818. PALIGNR mm6, mm0, 7, mm2
  1819. movq [r2+r3*1], mm6
  1820. psllq mm0, 8
  1821. PALIGNR mm5, mm0, 7, mm1
  1822. movq [r2+r3*2], mm5
  1823. psllq mm0, 8
  1824. PALIGNR mm6, mm0, 7, mm2
  1825. movq [r4+r3*1], mm6
  1826. psllq mm0, 8
  1827. PALIGNR mm5, mm0, 7, mm1
  1828. movq [r4+r3*2], mm5
  1829. RET
  1830. %macro PRED8x8L_VERTICAL_RIGHT 1
  1831. cglobal pred8x8l_vertical_right_%1, 4,5,7
  1832. sub r0, r3
  1833. lea r4, [r0+r3*2]
  1834. movq mm0, [r0+r3*1-8]
  1835. punpckhbw mm0, [r0+r3*0-8]
  1836. movq mm1, [r4+r3*1-8]
  1837. punpckhbw mm1, [r0+r3*2-8]
  1838. mov r4, r0
  1839. punpckhwd mm1, mm0
  1840. lea r0, [r0+r3*4]
  1841. movq mm2, [r0+r3*1-8]
  1842. punpckhbw mm2, [r0+r3*0-8]
  1843. lea r0, [r0+r3*2]
  1844. movq mm3, [r0+r3*1-8]
  1845. punpckhbw mm3, [r0+r3*0-8]
  1846. punpckhwd mm3, mm2
  1847. punpckhdq mm3, mm1
  1848. lea r0, [r0+r3*2]
  1849. movq mm0, [r0+r3*0-8]
  1850. movq mm1, [r4]
  1851. mov r0, r4
  1852. movq mm4, mm3
  1853. movq mm2, mm3
  1854. PALIGNR mm4, mm0, 7, mm0
  1855. PALIGNR mm1, mm2, 1, mm2
  1856. test r1, r1
  1857. jnz .do_left
  1858. .fix_lt_1:
  1859. movq mm5, mm3
  1860. pxor mm5, mm4
  1861. psrlq mm5, 56
  1862. psllq mm5, 48
  1863. pxor mm1, mm5
  1864. jmp .do_left
  1865. .fix_lt_2:
  1866. movq mm5, mm3
  1867. pxor mm5, mm2
  1868. psllq mm5, 56
  1869. psrlq mm5, 56
  1870. pxor mm2, mm5
  1871. test r2, r2
  1872. jnz .do_top
  1873. .fix_tr_1:
  1874. movq mm5, mm3
  1875. pxo

(listing truncated here; the rest of the file is not shown)