/libavcodec/x86/h264_intrapred_10bit.asm

;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max pw_1023
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
cextern pd_16

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pd_17:        times 4 dd 17

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
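; (pavgw rounds up, so the three instructions below compute exactly that:
;  (src + ((left+right)>>1) + 1) >> 1 == (left + 2*src + right + 2) >> 2)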
%macro PRED4x4_LOWPASS 4
    paddw      %2, %3
    psrlw      %2, 1
    pavgw      %1, %4, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movhps     m1, [r1-8]
    movhps     m2, [r0+r2*1-8]
    movhps     m4, [r0-8]
    punpckhwd  m2, m4
    movq       m3, [r0]
    punpckhdq  m1, m2
    PALIGNR    m3, m1, 10, m1
    movhps     m4, [r1+r2*1-8]
    PALIGNR    m0, m3, m4, 14, m4
    movhps     m4, [r1+r2*2-8]
    PALIGNR    m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq       [r1+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
;                                   ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m5, [r0]              ; ........t3t2t1t0
    movhps     m1, [r0-8]
    PALIGNR    m0, m5, m1, 14, m1    ; ......t3t2t1t0lt
    pavgw      m5, m0
    movhps     m1, [r0+r2*1-8]
    PALIGNR    m0, m1, 14, m1        ; ....t3t2t1t0ltl0
    movhps     m2, [r0+r2*2-8]
    PALIGNR    m1, m0, m2, 14, m2    ; ..t3t2t1t0ltl0l1
    movhps     m3, [r1+r2*1-8]
    PALIGNR    m2, m1, m3, 14, m3    ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq     m0, m1, 12
    psrldq     m1, 4
    movq       [r0+r2*1], m5
    movq       [r0+r2*2], m1
    PALIGNR    m5, m0, 14, m2
    pslldq     m0, 2
    movq       [r1+r2*1], m5
    PALIGNR    m1, m0, 14, m0
    movq       [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
;                                    ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]            ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2                 ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8]       ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3                ; l2 l3
    movq       m2, [r0+r2*2-8]       ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3                ; l0 l1
    punpckhdq  m1, m2                ; l0 l1 l2 l3
    punpckhqdq m1, m0                ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4             ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2             ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd  m5, m3
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m2, [r0+r2*1-8]
    paddw      m2, [r0+r2*2-8]
    paddw      m2, [r1+r2*1-8]
    paddw      m2, [r1+r2*2-8]
    psrlq      m2, 48
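    ; each row's left neighbour is the high word of its 8-byte load, so the
    ; high word of m2 now holds l0+l1+l2+l3; the shift moves that sum to word 0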
    movq       m0, [r0]
    HADDW      m0, m1
    paddw      m0, [pw_4]
    paddw      m0, m2
    psrlw      m0, 3
    SPLATW     m0, m0, 0
    movq       [r0+r2*1], m0
    movq       [r0+r2*2], m0
    movq       [r1+r2*1], m0
    movq       [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub        r0, r2
    movq       m0, [r0]
    movhps     m0, [r1]
    psrldq     m2, m0, 2
    pslldq     m3, m0, 2
    pshufhw    m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea        r1, [r0+r2*2]
    movhps     [r1+r2*2], m0
    psrldq     m0, 2
    movq       [r0+r2*1], m0
    psrldq     m0, 2
    movq       [r0+r2*2], m0
    psrldq     m0, 2
    movq       [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub        r0, r2
    movu       m1, [r0]
    movhps     m1, [r1]
    psrldq     m0, m1, 2
    psrldq     m2, m1, 4
    pavgw      m4, m0, m1
    PRED4x4_LOWPASS m0, m1, m2, m0
    lea        r1, [r0+r2*2]
    movq       [r0+r2*1], m4
    movq       [r0+r2*2], m0
    psrldq     m4, 2
    psrldq     m0, 2
    movq       [r1+r2*1], m4
    movq       [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0+r2*1-8]
    punpckhwd  m0, [r0+r2*2-8]
    movq       m1, [r1+r2*1-8]
    punpckhwd  m1, [r1+r2*2-8]
    punpckhdq  m0, m1
    pshufw     m1, m1, 0xFF
    movq       [r1+r2*2], m1
    movd       [r1+r2*1+4], m1
    pshufw     m2, m0, 11111001b
    movq       m1, m2
    pavgw      m2, m0
    pshufw     m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq       m6, m2
    punpcklwd  m6, m1
    movq       [r0+r2*1], m6
    psrlq      m2, 16
    psrlq      m1, 16
    punpcklwd  m2, m1
    movq       [r0+r2*2], m2
    psrlq      m2, 32
    movd       [r1+r2*1], m2
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub        r0, r1
    mova       m0, [r0]
%rep 3
    mova       [r0+r1*1], m0
    mova       [r0+r1*2], m0
    lea        r0, [r0+r1*2]
%endrep
    mova       [r0+r1*1], m0
    mova       [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov        r2d, 4
.loop:
    movq       m0, [r0+r1*0-8]
    movq       m1, [r0+r1*1-8]
    pshuflw    m0, m0, 0xff
    pshuflw    m1, m1, 0xff
    punpcklqdq m0, m0
    punpcklqdq m1, m1
    mova       [r0+r1*0], m0
    mova       [r0+r1*1], m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works
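; writes one 8-pixel (16-byte) row: two mmx stores when mmsize==8, one xmm store otherwise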
%if mmsize==8
    movq       [%1+0], %2
    movq       [%1+8], %3
%else
    movdqa     [%1], %2
%endif
%endmacro

%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub        r0, r1
    pxor       m4, m4
    movq       m0, [r0+0]
    movq       m1, [r0+8]
%if mmsize==16
    punpcklwd  m0, m1
    movhlps    m1, m0
    paddw      m0, m1
%else
    pshufw     m2, m0, 00001110b
    pshufw     m3, m1, 00001110b
    paddw      m0, m2
    paddw      m1, m3
    punpcklwd  m0, m1
%endif
    %1         m2, m0, 00001110b
    paddw      m0, m2
    lea        r5, [r1*3]
    lea        r4, [r0+r1*4]
    movzx      r2d, word [r0+r1*1-2]
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd       m2, r2d               ; s2
    movzx      r2d, word [r4+r1*1-2]
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd       m3, r2d               ; s3
    punpcklwd  m2, m3
    punpckldq  m0, m2                ; s0, s1, s2, s3
    %1         m3, m0, 11110110b     ; s2, s1, s3, s3
    %1         m0, m0, 01110100b     ; s0, s1, s3, s1
    paddw      m0, m3
    psrlw      m0, 2
    pavgw      m0, m4                ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd  m0, m0
    pshufd     m3, m0, 11111010b
    punpckldq  m0, m0
    SWAP       0,1
%else
    pshufw     m1, m0, 0x00
    pshufw     m2, m0, 0x55
    pshufw     m3, m0, 0xaa
    pshufw     m4, m0, 0xff
%endif
    MOV8       r0+r1*1, m1, m2
    MOV8       r0+r1*2, m1, m2
    MOV8       r0+r5*1, m1, m2
    MOV8       r0+r1*4, m1, m2
    MOV8       r4+r1*1, m3, m4
    MOV8       r4+r1*2, m3, m4
    MOV8       r4+r5*1, m3, m4
    MOV8       r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub        r0, r1
    mova       m0, [r0]
    pshuflw    m1, m0, 0x4e
    pshufhw    m1, m1, 0x4e
    paddw      m0, m1
    pshuflw    m1, m0, 0xb1
    pshufhw    m1, m1, 0xb1
    paddw      m0, m1
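    ; every word of the low qword is now t0+t1+t2+t3 and every word of the
    ; high qword is t4+t5+t6+t7: one DC sum per 4-pixel half of the top edge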
    lea        r2, [r1*3]
    lea        r3, [r0+r1*4]
    paddw      m0, [pw_2]
    psrlw      m0, 2
    mova       [r0+r1*1], m0
    mova       [r0+r1*2], m0
    mova       [r0+r2*1], m0
    mova       [r0+r1*4], m0
    mova       [r3+r1*1], m0
    mova       [r3+r1*2], m0
    mova       [r3+r2*1], m0
    mova       [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub        r0, r1
    lea        r2, [r1*3]
    lea        r3, [r0+r1*4]
    mova       m2, [r0]
    pmaddwd    m2, [pw_m32101234]
    HADDD      m2, m1
    movd       m0, [r0-4]
    psrld      m0, 14
    psubw      m2, m0                ; H
    movd       m0, [r3+r1*4-4]
    movd       m1, [r0+12]
    paddw      m0, m1
    psllw      m0, 4                 ; 16*(src[7*stride-1] + src[-stride+7])
    movzx      r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx      r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub        r4d, r5d
    movzx      r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx      r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub        r6d, r5d
    lea        r4d, [r4+r6*2]
    movzx      r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx      r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub        r5d, r6d
    lea        r5d, [r5*3]
    add        r4d, r5d
    movzx      r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx      r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub        r6d, r5d
    lea        r4d, [r4+r6*4]
    movd       m3, r4d               ; V
    punpckldq  m2, m3
    pmaddwd    m2, [pd_17]
    paddd      m2, [pd_16]
    psrad      m2, 5                 ; b, c
    mova       m3, [pw_pixel_max]
    pxor       m1, m1
    SPLATW     m0, m0, 1
    SPLATW     m4, m2, 2
    SPLATW     m2, m2, 0
    pmullw     m2, [pw_m32101234]    ; b
    pmullw     m5, m4, [pw_m3]       ; c
    paddw      m5, [pw_16]
    mov        r2d, 8
    add        r0, r1
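    ; per row: pred[x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5), with m0 = a,
    ; m2 = b*(x-3) for x = 0..7, and m5 starting at 16 - 3*c and gaining c per row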
.loop:
    paddsw     m6, m2, m5
    paddsw     m6, m0
    psraw      m6, 5
    CLIPW      m6, m1, m3
    mova       [r0], m6
    paddw      m5, m4
    add        r0, r1
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova       m0, [pw_512]          ; (1<<(BIT_DEPTH-1))
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    MOV8       r0+r3*0, m0, m0
    MOV8       r0+r3*1, m0, m0
    MOV8       r0+r3*2, m0, m0
    MOV8       r0+r1*1, m0, m0
    MOV8       r2+r3*0, m0, m0
    MOV8       r2+r3*1, m0, m0
    MOV8       r2+r3*2, m0, m0
    MOV8       r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub        r0, r3
    mova       m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW      m0, m1
    paddw      m0, [pw_4]
    psrlw      m0, 3
    SPLATW     m0, m0, 0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r1*1], m0
    mova       [r0+r3*4], m0
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m0
    mova       [r2+r1*1], m0
    mova       [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
;                        ptrdiff_t stride)
;-----------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r5, [r3*3]
    mova       m0, [r0+r3*2-16]
    punpckhwd  m0, [r0+r3*1-16]
    mova       m1, [r4+r3*0-16]
    punpckhwd  m1, [r0+r5*1-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*2-16]
    punpckhwd  m2, [r4+r3*1-16]
    mova       m3, [r4+r3*4-16]
    punpckhwd  m3, [r4+r5*1-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    not        r1
    and        r1, r3
    pslldq     m4, m3, 2
    psrldq     m5, m3, 2
    pshuflw    m4, m4, 11100101b
    pinsrw     m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3
    PRED4x4_LOWPASS m0, m2, m1, m0
    paddw      m0, m3
    HADDW      m0, m1
    paddw      m0, [pw_8]
    psrlw      m0, 4
    SPLATW     m0, m0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r5*1], m0
    mova       [r0+r3*4], m0
    mova       [r4+r3*1], m0
    mova       [r4+r3*2], m0
    mova       [r4+r5*1], m0
    mova       [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub        r0, r3
    mova       m0, [r0]
    shr        r1d, 14
    shr        r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r1*1], m0
    mova       [r0+r3*4], m0
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m0
    mova       [r2+r1*1], m0
    mova       [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova       m0, [r0-16]
    shr        r1d, 14
    dec        r1
    and        r1, r3
    sub        r1, r3
    punpckhwd  m0, [r0+r1-16]
    mova       m1, [r0+r3*2-16]
    punpckhwd  m1, [r0+r3*1-16]
    lea        r2, [r0+r3*4]
    lea        r1, [r3*3]
    punpckhdq  m1, m0
    mova       m2, [r2+r3*0-16]
    punpckhwd  m2, [r0+r1-16]
    mova       m3, [r2+r3*2-16]
    punpckhwd  m3, [r2+r3*1-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    PALIGNR    m4, m3, [r2+r1-16], 14, m0
    pslldq     m0, m4, 2
    pshuflw    m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd  m3, m4, m4
    punpcklwd  m4, m4
    pshufd     m0, m3, 0xff
    pshufd     m1, m3, 0xaa
    pshufd     m2, m3, 0x55
    pshufd     m3, m3, 0x00
    mova       [r0+r3*0], m0
    mova       [r0+r3*1], m1
    mova       [r0+r3*2], m2
    mova       [r0+r1*1], m3
    pshufd     m0, m4, 0xff
    pshufd     m1, m4, 0xaa
    pshufd     m2, m4, 0x55
    pshufd     m3, m4, 0x00
    mova       [r2+r3*0], m0
    mova       [r2+r3*1], m1
    mova       [r2+r3*2], m2
    mova       [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub        r0, r3
    mova       m3, [r0]
    shr        r1d, 14
    neg        r1
    shr        r2d, 13
    pslldq     m1, m3, 2
    psrldq     m2, m3, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr                       ; flags from shr r2d
    mova       m1, [r0+16]
    psrldq     m5, m1, 2
    PALIGNR    m2, m1, m3, 14, m3
    pshufhw    m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea        r1, [r3*3]
    psrldq     m5, m1, 14
    lea        r2, [r0+r3*4]
    PALIGNR    m2, m1, m6, 2, m0
    PALIGNR    m3, m1, m6, 14, m0
    PALIGNR    m5, m1, 2, m0
    pslldq     m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    mova       [r2+r3*4], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r2+r1*1], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r2+r3*2], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r2+r3*1], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*4], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r1*1], m1
    PALIGNR    m1, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*2], m1
    PALIGNR    m1, m6, 14, m6
    mova       [r0+r3*1], m1
    RET
.fix_tr:
    punpckhwd  m3, m3
    pshufd     m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r1, [r3*3]
    mova       m0, [r0+r3*1-16]
    punpckhwd  m0, [r0+r3*0-16]
    mova       m1, [r0+r1*1-16]
    punpckhwd  m1, [r0+r3*2-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*1-16]
    punpckhwd  m2, [r4+r3*0-16]
    mova       m3, [r4+r1*1-16]
    punpckhwd  m3, [r4+r3*2-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r4+r3*4-16]
    mova       m1, [r0]
    PALIGNR    m4, m3, m0, 14, m0
    PALIGNR    m1, m3, 2, m2
    pslldq     m0, m4, 2
    pshuflw    m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova       m3, [r0]
    shr        r2d, 13
    pslldq     m1, m3, 2
    psrldq     m2, m3, 2
    pinsrw     m1, [r0-2], 0
    pinsrw     m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR    m2, m3, m6, 2, m0
    PALIGNR    m5, m3, m6, 14, m0
    psrldq     m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    mova       [r4+r3*4], m6
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*2], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r1*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*4], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r4+r3*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r4+r3*2], m3
    PALIGNR    m3, m6, 14, m6
    mova       [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r1, [r3*3]
    mova       m0, [r0+r3*1-16]
    punpckhwd  m0, [r0+r3*0-16]
    mova       m1, [r0+r1*1-16]
    punpckhwd  m1, [r0+r3*2-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*1-16]
    punpckhwd  m2, [r4+r3*0-16]
    mova       m3, [r4+r1*1-16]
    punpckhwd  m3, [r4+r3*2-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r4+r3*4-16]
    mova       m1, [r0]
    PALIGNR    m4, m3, m0, 14, m0
    PALIGNR    m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3
    mova       m2, [r0]
    shr        r2d, 13
    pslldq     m1, m2, 2
    psrldq     m5, m2, 2
    pinsrw     m1, [r0-2], 0
    pinsrw     m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2
    PALIGNR    m6, m2, m3, 12, m1
    PALIGNR    m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw      m2, m5
    mova       [r0+r3*2], m0
    mova       [r0+r3*1], m2
    pslldq     m6, m3, 4
    pslldq     m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    PALIGNR    m2, m1, 14, m4
    mova       [r0+r1*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m3
    mova       [r0+r3*4], m0
    pslldq     m1, 2
    PALIGNR    m2, m1, 14, m4
    mova       [r4+r3*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m3
    mova       [r4+r3*2], m0
    pslldq     m1, 2
    PALIGNR    m2, m1, 14, m4
    mova       [r4+r1*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m1
    mova       [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova       m0, [r0+r3*0-16]
    punpckhwd  m0, [r0+r3*1-16]
    shr        r1d, 14
    dec        r1
    and        r1, r3
    sub        r1, r3
    mova       m4, [r0+r1*1-16]
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    mova       m1, [r0+r3*2-16]
    punpckhwd  m1, [r0+r1*1-16]
    punpckhdq  m0, m1
    mova       m2, [r2+r3*0-16]
    punpckhwd  m2, [r2+r3*1-16]
    mova       m3, [r2+r3*2-16]
    punpckhwd  m3, [r2+r1*1-16]
    punpckhdq  m2, m3
    punpckhqdq m0, m2
    PALIGNR    m1, m0, m4, 14, m4
    psrldq     m2, m0, 2
    pshufhw    m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq     m1, m0, 2
    psrldq     m2, m0, 4
    pshufhw    m1, m1, 10100100b
    pshufhw    m2, m2, 01010100b
    pavgw      m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd  m5, m4, m1
    punpcklwd  m4, m1
    mova       [r2+r3*0], m5
    mova       [r0+r3*0], m4
    pshufd     m0, m5, 11111001b
    pshufd     m1, m5, 11111110b
    pshufd     m2, m5, 11111111b
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m1
    mova       [r2+r1*1], m2
    PALIGNR    m2, m5, m4, 4, m0
    PALIGNR    m3, m5, m4, 8, m1
    PALIGNR    m5, m5, m4, 12, m4
    mova       [r0+r3*1], m2
    mova       [r0+r3*2], m3
    mova       [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif

;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
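; writes one 16-pixel (32-byte) row: two xmm stores, or four mmx stores when mmsize==8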
    mova       [%1+ 0], %2
    mova       [%1+mmsize], %3
%if mmsize==8
    mova       [%1+ 16], %4
    mova       [%1+ 24], %5
%endif
%endmacro

%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub        r0, r1
    mov        r2d, 8
    mova       m0, [r0+ 0]
    mova       m1, [r0+mmsize]
%if mmsize==8
    mova       m2, [r0+16]
    mova       m3, [r0+24]
%endif
.loop:
    MOV16      r0+r1*1, m0, m1, m2, m3
    MOV16      r0+r1*2, m0, m1, m2, m3
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov        r2d, 8
.vloop:
    movd       m0, [r0+r1*0-4]
    movd       m1, [r0+r1*1-4]
    SPLATW     m0, m0, 1
    SPLATW     m1, m1, 1
    MOV16      r0+r1*0, m0, m0, m0, m0
    MOV16      r0+r1*1, m1, m1, m1, m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2
    lea        r0, [r0+r1-2]
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+16]
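    ; r3d = sum of the 16 left neighbours plus 16 (rounding); the top-row sum
    ; is already in m0, so the combined total below is divided by 32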
    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16      r5+r1*0, m0, m0, m0, m0
    MOV16      r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2
    SPLATW     m0, m0
    paddw      m0, [pw_8]
    psrlw      m0, 4
    mov        r2d, 8
.loop:
    MOV16      r0+r1*1, m0, m0, m0, m0
    MOV16      r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0
    sub        r0, 2
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+8]
    shr        r3d, 4
    movd       m0, r3d
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16      r5+r1*0, m0, m0, m0, m0
    MOV16      r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2, 3
    mova       m0, [pw_512]
    mov        r2d, 8
.loop:
    MOV16      r0+r1*0, m0, m0, m0, m0
    MOV16      r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC