; Scrape/provenance header (comment-prefixed so the file assembles):
; PageRenderTime 76ms CodeModel.GetById 25ms app.highlight 41ms RepoModel.GetById 1ms app.codeStats 0ms
;
; /libavcodec/x86/h264_intrapred_10bit.asm
;
; http://github.com/FFmpeg/FFmpeg
; Assembly | 1199 lines | 1002 code | 71 blank | 126 comment | 0 complexity | 06df63d17f82727da95b572712082cda MD5 | raw file
   1;*****************************************************************************
   2;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
   3;*****************************************************************************
   4;* Copyright (C) 2005-2011 x264 project
   5;*
   6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
   7;*
   8;* This file is part of FFmpeg.
   9;*
  10;* FFmpeg is free software; you can redistribute it and/or
  11;* modify it under the terms of the GNU Lesser General Public
  12;* License as published by the Free Software Foundation; either
  13;* version 2.1 of the License, or (at your option) any later version.
  14;*
  15;* FFmpeg is distributed in the hope that it will be useful,
  16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18;* Lesser General Public License for more details.
  19;*
  20;* You should have received a copy of the GNU Lesser General Public
  21;* License along with FFmpeg; if not, write to the Free Software
  22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23;******************************************************************************
  24
  25%include "libavutil/x86/x86util.asm"
  26
  27SECTION_RODATA
  28
  29cextern pw_1023
  30%define pw_pixel_max pw_1023
  31cextern pw_512
  32cextern pw_16
  33cextern pw_8
  34cextern pw_4
  35cextern pw_2
  36cextern pw_1
  37cextern pd_16
  38
  39pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
  40pw_m3:        times 8 dw -3
  41pd_17:        times 4 dd 17
  42
  43SECTION .text
  44
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; Implemented as %1 = pavgw(src, (left+right)>>1).  Dropping the low bit of
; (left+right) is exact here: the rest of the rounded sum is even, so the
; dropped bit can never carry into bit 2 before the final >>2.
; Note: %2 (the "left" register) is clobbered.
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3      ; %2 = left + right
    psrlw       %2, 1       ; %2 = (left + right) >> 1
    pavgw       %1, %4, %2  ; %1 = (src + %2 + 1) >> 1
%endmacro
  52
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
; In: r0 = src, r1 = topright (unused; reused as scratch), r2 = stride in
; bytes.  Pixels are 16-bit (10-bit depth): a 4-pixel row is 8 bytes.
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2              ; r0 -> row above the block
    lea       r1, [r0+r2*2]       ; r1 -> second row of the block
    movhps    m1, [r1-8]          ; pixels left of row 1 (high qword)
    movhps    m2, [r0+r2*1-8]     ; pixels left of row 0
    movhps    m4, [r0-8]          ; pixels left of the top row (incl. lt)
    punpckhwd m2, m4              ; interleave the two left-edge words
    movq      m3, [r0]            ; top row t0..t3
    punpckhdq m1, m2              ; gather l1 l0 lt into m1's high words
    PALIGNR   m3, m1, 10, m1      ; diagonal line: t3 t2 t1 t0 lt l0 l1
    movhps    m4, [r1+r2*1-8]     ; pixel left of row 2
    PALIGNR   m0, m3, m4, 14, m4  ; shift l2 in from the left
    movhps    m4, [r1+r2*2-8]     ; pixel left of row 3
    PALIGNR   m2, m0, m4, 14, m4  ; shift l3 in from the left
    PRED4x4_LOWPASS m0, m2, m3, m0 ; 3-tap lowpass along the diagonal
    ; Each row of the output is the filtered diagonal shifted by one pixel.
    movq      [r1+r2*2], m0       ; row 3
    psrldq    m0, 2
    movq      [r1+r2*1], m0       ; row 2
    psrldq    m0, 2
    movq      [r0+r2*2], m0       ; row 1
    psrldq    m0, 2
    movq      [r0+r2*1], m0       ; row 0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif
  91
;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
;                                   ptrdiff_t stride)
;------------------------------------------------------------------------------
; In: r0 = src, r1 = scratch (topright arg unused), r2 = stride in bytes.
; Builds the edge vector t3..t0,lt,l0,l1,l2, then derives even rows from a
; pairwise average and odd rows from the 3-tap lowpass of that edge.
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2              ; r0 -> row above the block
    lea     r1, [r0+r2*2]       ; r1 -> second row of the block
    movq    m5, [r0]            ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
    pavgw   m5, m0              ; row 0: average of edge and shifted edge
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1 ; 3-tap lowpass of the full edge
    pslldq  m0, m1, 12          ; keep only the filtered left-column taps
    psrldq  m1, 4               ; row 1: lowpassed edge aligned to the block
    movq    [r0+r2*1], m5       ; row 0
    movq    [r0+r2*2], m1       ; row 1
    PALIGNR m5, m0, 14, m2      ; row 2: row 0 with a left tap shifted in
    pslldq  m0, 2
    movq    [r1+r2*1], m5       ; row 2
    PALIGNR m1, m0, 14, m0      ; row 3: row 1 with a left tap shifted in
    movq    [r1+r2*2], m1       ; row 3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif
 131
;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
;                                    ptrdiff_t stride)
;-------------------------------------------------------------------------------
; Gathers the edge t2 t1 t0 lt l0 l1 l2 l3 into one register, then builds the
; output rows from interleaved pairwise averages and 3-tap lowpass values.
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2          ; r0 -> row above the block
    lea        r1, [r0+r2*2]   ; r1 -> second row of the block
    movq       m0, [r0-8]      ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2           ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8] ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3          ; l2 l3
    movq       m2, [r0+r2*2-8] ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3          ; l0 l1
    punpckhdq  m1, m2          ; l0 l1 l2 l3
    punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3      ; averages of adjacent edge pixels
    PRED4x4_LOWPASS m3, m1, m0, m3 ; 3-tap lowpass of the edge
    punpcklwd  m5, m3          ; interleave avg/lowpass pairs (lower rows)
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4  ; assemble the remaining top-row pixels
    movq       [r1+r2*2], m5   ; row 3
    movhps     [r0+r2*2], m5   ; row 1
    psrldq     m5, 4
    movq       [r1+r2*1], m5   ; row 2
    movq       [r0+r2*1], m3   ; row 0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif
 174
;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC prediction: fill the 4x4 block with (sum(4 left) + sum(4 top) + 4) >> 3.

INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2              ; r0 -> row above the block
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]     ; each load ends with one left-edge pixel,
    paddw  m2, [r0+r2*2-8]     ; so the left sum accumulates in the last word
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48              ; isolate that last-word sum
    movq   m0, [r0]            ; top row t0..t3
    HADDW  m0, m1              ; m0 = t0+t1+t2+t3
    paddw  m0, [pw_4]          ; rounding term
    paddw  m0, m2
    psrlw  m0, 3               ; dc = (sum + 4) >> 3
    SPLATW m0, m0, 0           ; broadcast dc to all four lanes
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET
 199
;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; In: r0 = src, r1 = topright pointer, r2 = stride in bytes.
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub        r0, r2            ; r0 -> row above the block
    movq       m0, [r0]          ; t0..t3
    movhps     m0, [r1]          ; t4..t7 (topright) in the high qword
    psrldq     m2, m0, 2         ; edge shifted one pixel left
    pslldq     m3, m0, 2         ; edge shifted one pixel right
    pshufhw    m2, m2, 10100100b ; replicate t7 into the vacated last lane
    PRED4x4_LOWPASS m0, m3, m2, m0 ; 3-tap lowpass over t0..t7
    lea        r1, [r0+r2*2]
    movhps     [r1+r2*2], m0     ; row 3 = high qword of the filtered edge
    psrldq     m0, 2             ; each row shifts one pixel down the diagonal
    movq       [r0+r2*1], m0     ; row 0
    psrldq     m0, 2
    movq       [r0+r2*2], m0     ; row 1
    psrldq     m0, 2
    movq       [r1+r2*1], m0     ; row 2
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif
 230
;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
; In: r0 = src, r1 = topright pointer, r2 = stride in bytes.
; Even rows come from a 2-tap average, odd rows from the 3-tap lowpass.
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub        r0, r2          ; r0 -> row above the block
    movu       m1, [r0]        ; t0.. (high half replaced below)
    movhps     m1, [r1]        ; topright t4..t7 in the high qword
    psrldq     m0, m1, 2       ; edge shifted by one pixel
    psrldq     m2, m1, 4       ; edge shifted by two pixels
    pavgw      m4, m0, m1      ; 2-tap averages (even rows)
    PRED4x4_LOWPASS m0, m1, m2, m0 ; 3-tap values (odd rows)
    lea        r1, [r0+r2*2]
    movq       [r0+r2*1], m4   ; row 0
    movq       [r0+r2*2], m0   ; row 1
    psrldq     m4, 2           ; advance one pixel along the edge
    psrldq     m0, 2
    movq       [r1+r2*1], m4   ; row 2
    movq       [r1+r2*2], m0   ; row 3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif
 260
;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
; MMX version: 4 pixels (8 bytes) per register.  Gathers the left column
; l0..l3, then builds rows from interleaved average/lowpass pairs; the
; bottom-right of the block saturates to l3 (no samples below the block).
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2              ; r0 -> row above the block
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]     ; last word = l0
    punpckhwd m0, [r0+r2*2-8]     ; pair with l1
    movq      m1, [r1+r2*1-8]     ; last word = l2
    punpckhwd m1, [r1+r2*2-8]     ; pair with l3
    punpckhdq m0, m1              ; m0 = l0 l1 l2 l3
    pshufw    m1, m1, 0xFF        ; broadcast l3
    movq      [r1+r2*2], m1       ; row 3 = l3 l3 l3 l3
    movd      [r1+r2*1+4], m1     ; right half of row 2 is also all l3
    pshufw    m2, m0, 11111001b   ; l1 l2 l3 l3
    movq      m1, m2
    pavgw     m2, m0              ; averages of neighbouring left pixels

    pshufw    m5, m0, 11111110b   ; l2 l3 l3 l3
    PRED4x4_LOWPASS m1, m0, m5, m1 ; 3-tap lowpass values
    movq      m6, m2
    punpcklwd m6, m1              ; interleave avg/lowpass -> row 0
    movq      [r0+r2*1], m6
    psrlq     m2, 16              ; step one pixel down the column
    psrlq     m1, 16
    punpcklwd m2, m1              ; -> row 1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2       ; left half of row 2
    RET
 293
 294
 295
 296;-----------------------------------------------------------------------------
 297; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
 298;-----------------------------------------------------------------------------
 299INIT_XMM sse2
 300cglobal pred8x8_vertical_10, 2, 2
 301    sub  r0, r1
 302    mova m0, [r0]
 303%rep 3
 304    mova [r0+r1*1], m0
 305    mova [r0+r1*2], m0
 306    lea  r0, [r0+r1*2]
 307%endrep
 308    mova [r0+r1*1], m0
 309    mova [r0+r1*2], m0
 310    RET
 311
 312;-----------------------------------------------------------------------------
 313; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
 314;-----------------------------------------------------------------------------
 315INIT_XMM sse2
 316cglobal pred8x8_horizontal_10, 2, 3
 317    mov         r2d, 4
 318.loop:
 319    movq         m0, [r0+r1*0-8]
 320    movq         m1, [r0+r1*1-8]
 321    pshuflw      m0, m0, 0xff
 322    pshuflw      m1, m1, 0xff
 323    punpcklqdq   m0, m0
 324    punpcklqdq   m1, m1
 325    mova  [r0+r1*0], m0
 326    mova  [r0+r1*1], m1
 327    lea          r0, [r0+r1*2]
 328    dec          r2d
 329    jg .loop
 330    REP_RET
 331
 332;-----------------------------------------------------------------------------
 333; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
 334;-----------------------------------------------------------------------------
; Store one 8-pixel (16-byte) row: two mmx registers, or a single xmm one
; (in which case the third argument is ignored).
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq    [%1+0], %2
    movq    [%1+8], %3
%else
    movdqa    [%1], %2
%endif
%endmacro
 344
; 8x8 DC prediction with four per-quadrant DC values (as in chroma DC).
; %1: word-shuffle instruction - pshufw for mmx, pshuflw for sse2.
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1             ; r0 -> row above the block
    pxor        m4, m4
    movq        m0, [r0+0]         ; t0..t3
    movq        m1, [r0+8]         ; t4..t7
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %1          m2, m0, 00001110b
    paddw       m0, m2             ; words 0,1 = s0 = sum(t0..3), s1 = sum(t4..7)

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    ; Left column is not contiguous, so sum it with scalar loads:
    ; s2 = top four left pixels, s3 = bottom four.
    movzx      r2d, word [r0+r1*1-2]
    movzx      r3d, word [r0+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r0+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4-2]
    add        r2d, r3d
    movd        m2, r2d            ; s2

    movzx      r2d, word [r4+r1*1-2]
    movzx      r3d, word [r4+r1*2-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r5*1-2]
    add        r2d, r3d
    movzx      r3d, word [r4+r1*4-2]
    add        r2d, r3d
    movd        m3, r2d            ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2            ; s0, s1, s2, s3
    %1          m3, m0, 11110110b ; s2, s1, s3, s3
    %1          m0, m0, 01110100b ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3 (quadrant DCs)
%if mmsize==16
    ; Expand the four DCs to full rows.  After SWAP the stores below read
    ; m1 (top half) and m3 (bottom half); MOV8 ignores its third argument
    ; when mmsize==16, so m2/m4 are unused there.
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00      ; broadcast each quadrant DC
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    MOV8   r0+r1*1, m1, m2        ; top four rows
    MOV8   r0+r1*2, m1, m2
    MOV8   r0+r5*1, m1, m2
    MOV8   r0+r1*4, m1, m2
    MOV8   r4+r1*1, m3, m4        ; bottom four rows
    MOV8   r4+r1*2, m3, m4
    MOV8   r4+r5*1, m3, m4
    MOV8   r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw
 418
;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC from the top row only: the left half of every row gets
; (t0+t1+t2+t3+2)>>2, the right half gets (t4+t5+t6+t7+2)>>2.
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub         r0, r1             ; r0 -> row above the block
    mova        m0, [r0]           ; t0..t7
    pshuflw     m1, m0, 0x4e       ; swap dwords within each qword half...
    pshufhw     m1, m1, 0x4e
    paddw       m0, m1
    pshuflw     m1, m0, 0xb1       ; ...then words within each dword:
    pshufhw     m1, m1, 0xb1
    paddw       m0, m1             ; each half now holds its 4-pixel sum
    lea         r2, [r1*3]
    lea         r3, [r0+r1*4]
    paddw       m0, [pw_2]         ; rounding term
    psrlw       m0, 2              ; per-half DC = (sum + 2) >> 2
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET
 445
;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Plane (gradient) prediction: per row, pred = clip((a + b*(x-3) + c*y' + 16)
; >> 5) where H/V are the weighted horizontal/vertical edge gradients and
; b = (17*H+16)>>5, c = (17*V+16)>>5.
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1               ; r0 -> row above the block
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]   ; weighted top row, weights -3..4
    HADDD     m2, m1
    movd      m0, [r0-4]           ; two pixels left of the top row
    psrld     m0, 14               ; = 4*src[-stride-1]: the low word shifts
                                   ; out entirely since pixels are < 2^14
    psubw     m2, m0               ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                ; 16*(src[7*stride-1] + src[-stride+7])
    movzx    r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d              ; V
    punpckldq m2, m3               ; {H, V}
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1            ; broadcast a = 16*(corner sum)
    SPLATW    m4, m2, 2            ; broadcast c
    SPLATW    m2, m2, 0            ; broadcast b
    pmullw    m2, [pw_m32101234]   ; b*(x-3) for x = 0..7
    pmullw    m5, m4, [pw_m3]      ; start the c ramp at c*(-3)...
    paddw     m5, [pw_16]          ; ...plus the +16 rounding term
    mov      r2d, 8                ; 8 output rows
    add       r0, r1
.loop:
    paddsw    m6, m2, m5           ; b-term + c-term (+16)
    paddsw    m6, m0               ; + a
    psraw     m6, 5
    CLIPW     m6, m1, m3           ; clamp to [0, pixel_max]
    mova    [r0], m6
    paddw     m5, m4               ; advance the c*y ramp by c
    add       r0, r1
    dec r2d
    jg .loop
    REP_RET
 507
 508
;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fill the 8x8 block with mid-grey (no neighbours available).
; r3 = stride in bytes; the availability flags are unused here.
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC
 533
;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC from the lowpass-filtered top row only.
; NOTE(review): has_topleft/has_topright appear to arrive as single-bit masks
; so that the shifts below yield 0 or 2 (= one pixel offset) -- confirm
; against the caller.
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3            ; r0 -> row above the block
    mova        m0, [r0]          ; top row t0..t7
    shr        r1d, 14            ; 2 if top-left available, else 0
    shr        r2d, 13            ; 2 if top-right available, else 0
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0    ; left tap: topleft pixel, or t0 replicated
    pinsrw      m2, [r0+r2+14], 7 ; right tap: t8, or t7 replicated
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
    HADDW       m0, m1            ; sum of the 8 filtered pixels
    paddw       m0, [pw_4]        ; rounding term
    psrlw       m0, 3             ; dc = (sum + 4) >> 3
    SPLATW      m0, m0, 0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif
 573
;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
;                        ptrdiff_t stride)
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
; DC over the lowpass-filtered left column and top row (16 samples).
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3            ; r0 -> row above the block
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    ; Gather the 8 left-column pixels into m3 by interleaving the last word
    ; of each row (word -> dword -> qword merges).
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r0]          ; top row t0..t7
    shr        r1d, 14            ; 2 if top-left available, else 0 (mask arg)
    shr        r2d, 13            ; 2 if top-right available, else 0
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0    ; left tap: topleft, or t0 replicated
    pinsrw      m2, [r0+r2+14], 7 ; right tap: t8, or t7 replicated
    not         r1                ; r1 was 0/-2 -> -1/1 ...
    and         r1, r3            ; ... -> stride/0 (stride must be even)
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b ; replicate the edge pixel into the gap
    pinsrw      m5, [r0+r1-2], 7  ; far tap: topleft, or the first left pixel
    PRED4x4_LOWPASS m3, m4, m5, m3 ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
    paddw       m0, m3
    HADDW       m0, m1            ; sum of all 16 filtered edge pixels
    paddw       m0, [pw_8]        ; rounding term
    psrlw       m0, 4             ; dc = (sum + 8) >> 4
    SPLATW      m0, m0
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r5*1], m0
    mova [r0+r3*4], m0
    mova [r4+r3*1], m0
    mova [r4+r3*2], m0
    mova [r4+r5*1], m0
    mova [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif
 633
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Lowpass-filter the top row and copy it into all eight rows.
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3            ; r0 -> row above the block
    mova        m0, [r0]          ; top row t0..t7
    shr        r1d, 14            ; 2 if top-left available, else 0 (mask arg)
    shr        r2d, 13            ; 2 if top-right available, else 0
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0    ; left tap: topleft, or t0 replicated
    pinsrw      m2, [r0+r2+14], 7 ; right tap: t8, or t7 replicated
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0 ; filtered top row
    mova [r0+r3*1], m0
    mova [r0+r3*2], m0
    mova [r0+r1*1], m0
    mova [r0+r3*4], m0
    mova [r2+r3*1], m0
    mova [r2+r3*2], m0
    mova [r2+r1*1], m0
    mova [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif
 669
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Each output row is one lowpass-filtered left-column pixel broadcast
; across the whole row.
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]        ; row 0 (left-edge words)
    shr        r1d, 14             ; 2 if top-left available, else 0 (mask arg)
    dec         r1
    and         r1, r3             ; r1 = -stride if topleft available, else 0
    sub         r1, r3             ; (stride assumed even for the `and`)
    punpckhwd   m0, [r0+r1-16]     ; pair l0 with topleft (or l0 again)
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1             ; gathered left column (with topleft)
    PALIGNR     m4, m3, [r2+r1-16], 14, m0 ; shift in the bottom left pixel
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b  ; replicate the edge pixel into the gap
    PRED4x4_LOWPASS m4, m3, m0, m4 ; filtered left column
    punpckhwd   m3, m4, m4         ; duplicate each filtered pixel...
    punpcklwd   m4, m4
    pshufd      m0, m3, 0xff       ; ...then broadcast one per row
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova [r0+r3*0], m0             ; top four rows
    mova [r0+r3*1], m1
    mova [r0+r3*2], m2
    mova [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova [r2+r3*0], m0             ; bottom four rows
    mova [r2+r3*1], m1
    mova [r2+r3*2], m2
    mova [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif
 726
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Filters t0..t15 (falling back to replicated t7 when the top-right block is
; unavailable), applies a second lowpass pass along the diagonal, then emits
; each row as a one-pixel shift of the 16-pixel diagonal.
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3            ; r0 -> row above the block
    mova        m3, [r0]          ; top row t0..t7
    shr        r1d, 14            ; 2 if top-left available, else 0 (mask arg)
    neg         r1
    shr        r2d, 13            ; 2 if top-right available, else 0
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0    ; left tap: topleft, or t0 replicated
    pinsrw      m2, [r0+r2+14], 7 ; right tap: t8, or t7 replicated
    PRED4x4_LOWPASS m6, m2, m1, m3 ; m6 = filtered t0..t7
    jz .fix_tr ; flags from shr r2d
    mova        m1, [r0+16]       ; top-right row t8..t15
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b ; replicate t15 into the vacated lane
    PRED4x4_LOWPASS m1, m2, m5, m1 ; m1 = filtered t8..t15
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14        ; last filtered top-right pixel
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6,  2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1,  2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6 ; second pass, low half of the diagonal
    PRED4x4_LOWPASS m1, m3, m5, m1 ; second pass, high half of the diagonal
    ; The diagonal now sits in m1:m6; shift one pixel per row while storing
    ; (pslldq on m6 feeds the PALIGNR across the register boundary).
    mova [r2+r3*4], m1            ; row 7
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r1*1], m1            ; row 6
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*2], m1            ; row 5
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r2+r3*1], m1            ; row 4
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m1            ; row 3
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m1            ; row 2
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m1            ; row 1
    PALIGNR     m1, m6, 14, m6
    mova [r0+r3*1], m1            ; row 0
    RET
.fix_tr:
    ; No top-right block: use t7 replicated as the entire filtered t8..t15.
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif
 795
;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Builds the filtered diagonal from the left column, top-left and top row,
; then emits each row as a one-pixel shift of it.
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub         r0, r3            ; r0 -> row above the block
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; Gather the left column (plus top-left) into m3 via word/dword/qword
    ; interleaves of the last word of each row.
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]  ; bottom row: supplies the last left pixel
    mova        m1, [r0]          ; top row
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3,  2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b ; replicate the edge pixel into the gap
    PRED4x4_LOWPASS m6, m1, m4, m3 ; filtered left edge
    PRED4x4_LOWPASS m4, m3, m0, m4 ; filtered shifted variant
    mova        m3, [r0]          ; top row again
    shr        r2d, 13            ; 2 if top-right available, else 0 (mask arg)
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0     ; topleft (guaranteed by the standard)
    pinsrw      m2, [r0+r2+14], 7 ; right tap: t8, or t7 replicated
    PRED4x4_LOWPASS m3, m2, m1, m3 ; filtered top row
    PALIGNR     m2, m3, m6,  2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6 ; second pass, left/low half of diagonal
    PRED4x4_LOWPASS m3, m5, m7, m3 ; second pass, top/high half of diagonal
    ; Diagonal in m3:m6; shift one pixel per row while storing.
    mova [r4+r3*4], m6            ; row 7
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*1], m3            ; row 0
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*2], m3            ; row 1
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r1*1], m3            ; row 2
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r0+r3*4], m3            ; row 3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*1], m3            ; row 4
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova [r4+r3*2], m3            ; row 5
    PALIGNR     m3, m6, 14, m6
    mova [r4+r1*1], m3            ; row 6
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif
 870
;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Top rows come from the 2-tap average / 3-tap lowpass of the filtered top
; edge; deeper rows shift in filtered left-column pixels from below.
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3            ; r0 -> row above the block
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; Gather the left column (plus top-left) into m3.
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]  ; bottom row: supplies the last left pixel
    mova        m1, [r0]          ; top row
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3,  2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3 ; filtered left edge
    mova        m2, [r0]          ; top row again
    shr        r2d, 13            ; 2 if top-right available, else 0 (mask arg)
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0     ; topleft (guaranteed by the standard)
    pinsrw      m5, [r0+r2+14], 7 ; right tap: t8, or t7 replicated
    PRED4x4_LOWPASS m2, m5, m1, m2 ; filtered top row
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5 ; odd rows (3-tap)
    pavgw       m2, m5            ; even rows (2-tap average)
    mova [r0+r3*2], m0            ; row 1
    mova [r0+r3*1], m2            ; row 0
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1 ; filtered left pixels fed in from below
    PALIGNR     m2, m1, 14, m4
    mova [r0+r1*1], m2            ; row 2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r0+r3*4], m0            ; row 3
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r3*1], m2            ; row 4
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova [r4+r3*2], m0            ; row 5
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova [r4+r1*1], m2            ; row 6
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova [r4+r3*4], m0            ; row 7
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif
 941
;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Filters the left column, then interleaves 2-tap averages with 3-tap
; lowpass values; lower rows run off the column end and saturate toward
; the bottom-left pixel.
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]   ; row 0 (left-edge words)
    punpckhwd   m0, [r0+r3*1-16]
    shr        r1d, 14             ; 2 if top-left available, else 0 (mask arg)
    dec         r1
    and         r1, r3             ; r1 = -stride if topleft available, else 0
    sub         r1, r3             ; (stride assumed even for the `and`)
    mova        m4, [r0+r1*1-16]   ; topleft row, or row 0 again
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2             ; gathered left column
    PALIGNR     m1, m0, m4, 14, m4 ; shift in the top-left tap
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b  ; replicate the last pixel into the gap
    PRED4x4_LOWPASS m0, m1, m2, m0 ; filtered left column
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b  ; keep the last pixel at the tail
    pshufhw     m2, m2, 01010100b  ; after each shift
    pavgw       m4, m0, m1         ; 2-tap averages
    PRED4x4_LOWPASS m1, m2, m0, m1 ; 3-tap values
    punpckhwd   m5, m4, m1         ; interleaved pairs, second half
    punpcklwd   m4, m1             ; interleaved pairs, first half
    mova [r2+r3*0], m5             ; row 4
    mova [r0+r3*0], m4             ; row 0
    pshufd      m0, m5, 11111001b  ; rows below 4 slide off the end...
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b  ; ...row 7 is fully saturated
    mova [r2+r3*1], m0             ; row 5
    mova [r2+r3*2], m1             ; row 6
    mova [r2+r1*1], m2             ; row 7
    PALIGNR     m2, m5, m4, 4, m0  ; rows 1-3: sliding window over m5:m4
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova [r0+r3*1], m2             ; row 1
    mova [r0+r3*2], m3             ; row 2
    mova [r0+r1*1], m5             ; row 3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif
1003
1004
1005;-----------------------------------------------------------------------------
1006; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
1007;-----------------------------------------------------------------------------
; MOV16 dst, m_a, m_b[, m_c, m_d]
; Store one 16-pixel (32-byte) row of 10-bit pixels starting at address %1.
; With 16-byte XMM registers two stores suffice; with 8-byte MMX registers
; (mmsize==8) the two extra operands cover bytes 16..31.
%macro MOV16 3-5
    mova [%1+     0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+    16], %4
    mova [%1+    24], %5
%endif
%endmacro
1016
; Vertical 16x16 prediction: replicate the row of 16 pixels above the block
; into all 16 rows.  r0 = src, r1 = stride in bytes, r2 = loop counter.
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1               ; r0 -> row above the block
    mov  r2d, 8                ; 8 iterations x 2 rows each = 16 rows
    mova  m0, [r0+ 0]          ; load the 32 bytes (16 pixels) of the top row
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]          ; MMX needs four 8-byte registers for one row
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL
1040
;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Horizontal 16x16 prediction: each row is filled with its left-neighbor
; pixel.  r0 = src, r1 = stride in bytes, r2 = loop counter.
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov   r2d, 8               ; 8 iterations x 2 rows each = 16 rows
.vloop:
    movd   m0, [r0+r1*0-4]     ; load src[-2] and src[-1] of this row
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1           ; broadcast word 1 = src[-1], the left neighbor
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL
1064
;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC 16x16 prediction: fill the block with
;   dc = (sum(top 16 pixels) + sum(left 16 pixels) + 16) >> 5.
; r0 = row pointer, r1 = stride, r2 = scratch, r3/r4 = left-column sums,
; r5 = saved block origin.
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0          ; keep the block origin for the store loop
    sub        r0, r1          ; r0 -> row above the block
    mova       m0, [r0+0]      ; sum the 16 top pixels into m0 lanes
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2          ; horizontal add: low word of m0 = top sum

    ; Scalar sum of the 16 left-column pixels, two rows per step
    ; (r3d accumulates even rows, r4d odd rows).
    lea        r0, [r0+r1-2]   ; r0 -> left pixel of row 0
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]  ; left sum + rounding constant 16

    movd       m1, r3d
    paddw      m0, m1          ; low word = top + left + 16 (32*1023+16 < 2^15,
                               ; so the 16-bit lane cannot overflow)
    psrlw      m0, 5           ; divide by 32
    SPLATW     m0, m0          ; broadcast dc to all lanes
    mov       r3d, 8           ; 8 iterations x 2 rows each = 16 rows
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
1110
;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Top-DC 16x16 prediction (left neighbors unavailable): fill the block with
;   dc = (sum(top 16 pixels) + 8) >> 4.
; r0 = row pointer, r1 = stride, r2 = loop counter.
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1          ; r0 -> row above the block
    mova       m0, [r0+0]      ; sum the 16 top pixels
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2          ; low word of m0 = top sum

    SPLATW     m0, m0          ; broadcast the sum, then round per lane
    paddw      m0, [pw_8]      ; + 8 rounding
    psrlw      m0, 4           ; / 16
    mov       r2d, 8           ; 8 iterations x 2 rows each = 16 rows
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC
1142
;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Left-DC 16x16 prediction (top neighbors unavailable): fill the block with
;   dc = (sum(left 16 pixels) + 8) >> 4.
; r0 = row pointer, r1 = stride, r2 = scratch, r3/r4 = partial sums,
; r5 = saved block origin.
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0          ; keep the block origin for the store loop

    ; Scalar sum of the 16 left-column pixels, two rows per step.
    sub        r0, 2           ; r0 -> left pixel of row 0
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]   ; sum + 8 rounding
    shr       r3d, 4           ; / 16 (done in scalar, unlike the top-dc path)

    movd       m0, r3d
    SPLATW     m0, m0          ; broadcast dc to all lanes
    mov       r3d, 8           ; 8 iterations x 2 rows each = 16 rows
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC
1179
;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Flat-DC 16x16 prediction (no neighbors available): fill the block with the
; 10-bit mid-grey value 512 (= 1 << (10-1)), taken from the pw_512 constant.
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
    mova       m0, [pw_512]    ; all lanes = 512
    mov       r2d, 8           ; 8 iterations x 2 rows each = 16 rows
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC