PageRenderTime 106ms CodeModel.GetById 18ms app.highlight 77ms RepoModel.GetById 1ms app.codeStats 1ms

/libavcodec/x86/h264_intrapred.asm

http://github.com/FFmpeg/FFmpeg
Assembly | 2757 lines | 2482 code | 134 blank | 141 comment | 14 complexity | 54c2ba2d59cfdaf4f91ee096a84aef53 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1;******************************************************************************
   2;* H.264 intra prediction asm optimizations
   3;* Copyright (c) 2010 Fiona Glaser
   4;* Copyright (c) 2010 Holger Lubitz
   5;* Copyright (c) 2010 Loren Merritt
   6;* Copyright (c) 2010 Ronald S. Bultje
   7;*
   8;* This file is part of FFmpeg.
   9;*
  10;* FFmpeg is free software; you can redistribute it and/or
  11;* modify it under the terms of the GNU Lesser General Public
  12;* License as published by the Free Software Foundation; either
  13;* version 2.1 of the License, or (at your option) any later version.
  14;*
  15;* FFmpeg is distributed in the hope that it will be useful,
  16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18;* Lesser General Public License for more details.
  19;*
  20;* You should have received a copy of the GNU Lesser General Public
  21;* License along with FFmpeg; if not, write to the Free Software
  22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23;******************************************************************************
  24
  25%include "libavutil/x86/x86util.asm"
  26
  27SECTION_RODATA
  28
  29tm_shuf: times 8 db 0x03, 0x80
  30pw_ff00: times 8 dw 0xff00
  31plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
  32             db  1,  2,  3,  4,  5,  6,  7,  8
  33plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
  34             db  1,  2,  3,  4,  0,  0,  0,  0
  35pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
  36pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
  37pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
  38pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
  39
  40SECTION .text
  41
  42cextern pb_1
  43cextern pb_3
  44cextern pw_4
  45cextern pw_5
  46cextern pw_8
  47cextern pw_16
  48cextern pw_17
  49cextern pw_32
  50
  51;-----------------------------------------------------------------------------
  52; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
  53;-----------------------------------------------------------------------------
  54
   55INIT_MMX mmx
   56cglobal pred16x16_vertical_8, 2,3
    ; Vertical 16x16 prediction: replicate the row above the block into all
    ; 16 rows.  r0 = src (points at top-left of block), r1 = stride.
   57    sub   r0, r1                  ; r0 -> row above the block
   58    mov   r2, 8                   ; 8 iterations x 2 rows = 16 rows
   59    movq mm0, [r0+0]              ; left 8 bytes of top row
   60    movq mm1, [r0+8]              ; right 8 bytes of top row
   61.loop:
   62    movq [r0+r1*1+0], mm0
   63    movq [r0+r1*1+8], mm1
   64    movq [r0+r1*2+0], mm0
   65    movq [r0+r1*2+8], mm1
   66    lea   r0, [r0+r1*2]           ; advance two rows per iteration
   67    dec   r2
   68    jg .loop
   69    REP_RET
  70
   71INIT_XMM sse
   72cglobal pred16x16_vertical_8, 2,3
    ; Same as the MMX version but copies the full 16-byte top row with one
    ; xmm register; 4 iterations x 4 rows = 16 rows.
   73    sub   r0, r1                  ; r0 -> row above the block
   74    mov   r2, 4
   75    movaps xmm0, [r0]             ; whole 16-byte top row (aligned)
   76.loop:
   77    movaps [r0+r1*1], xmm0
   78    movaps [r0+r1*2], xmm0
   79    lea   r0, [r0+r1*2]
   80    movaps [r0+r1*1], xmm0
   81    movaps [r0+r1*2], xmm0
   82    lea   r0, [r0+r1*2]
   83    dec   r2
   84    jg .loop
   85    REP_RET
  86
  87;-----------------------------------------------------------------------------
  88; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
  89;-----------------------------------------------------------------------------
  90
   91%macro PRED16x16_H 0
   92cglobal pred16x16_horizontal_8, 2,3
    ; Horizontal 16x16 prediction: each row is filled with the byte to its
    ; left (src[-1]).  Two rows handled per loop iteration.
   93    mov       r2, 8               ; 8 iterations x 2 rows
   94%if cpuflag(ssse3)
   95    mova      m2, [pb_3]          ; pshufb mask: broadcast byte 3 of the load
   96%endif
   97.loop:
    ; load 4 bytes ending at src[-1]; byte 3 of the dword is the left pixel
   98    movd      m0, [r0+r1*0-4]
   99    movd      m1, [r0+r1*1-4]
  100
  101%if cpuflag(ssse3)
  102    pshufb    m0, m2              ; broadcast left pixel to all 16 bytes
  103    pshufb    m1, m2
  104%else
    ; mmx path: widen then SPLATW to replicate the left pixel across 8 bytes;
    ; the high 8 bytes of the row are stored here since m0/m1 are only 8 wide
  105    punpcklbw m0, m0
  106    punpcklbw m1, m1
  107    SPLATW    m0, m0, 3
  108    SPLATW    m1, m1, 3
  109    mova [r0+r1*0+8], m0
  110    mova [r0+r1*1+8], m1
  111%endif
  112
  113    mova [r0+r1*0], m0
  114    mova [r0+r1*1], m1
  115    lea       r0, [r0+r1*2]
  116    dec       r2
  117    jg .loop
  118    REP_RET
  119%endmacro
  120
  121INIT_MMX mmx
  122PRED16x16_H
  123INIT_MMX mmxext
  124PRED16x16_H
  125INIT_XMM ssse3
  126PRED16x16_H
 127
 128;-----------------------------------------------------------------------------
 129; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
 130;-----------------------------------------------------------------------------
 131
  132%macro PRED16x16_DC 0
  133cglobal pred16x16_dc_8, 2,7
    ; DC 16x16 prediction: dc = (sum of 16 top + 16 left pixels + 16) >> 5,
    ; broadcast to the whole block.  Top sum via psadbw; left column summed
    ; scalar, interleaved into r5d/r6d accumulators.
  134    mov       r4, r0              ; keep block origin for the store loop
  135    sub       r0, r1              ; r0 -> row above
  136    pxor      mm0, mm0
  137    pxor      mm1, mm1
  138    psadbw    mm0, [r0+0]         ; sum of left 8 top pixels
  139    psadbw    mm1, [r0+8]         ; sum of right 8 top pixels
  140    dec        r0                 ; r0 -> one left of top row
  141    movzx     r5d, byte [r0+r1*1] ; first left pixel
  142    paddw     mm0, mm1
  143    movd      r6d, mm0            ; r6d = top-row sum
  144    lea        r0, [r0+r1*2]
    ; 7 x 2 = 14 more left pixels, two per unrolled step
  145%rep 7
  146    movzx     r2d, byte [r0+r1*0]
  147    movzx     r3d, byte [r0+r1*1]
  148    add       r5d, r2d
  149    add       r6d, r3d
  150    lea        r0, [r0+r1*2]
  151%endrep
  152    movzx     r2d, byte [r0+r1*0] ; 16th left pixel
  153    add       r5d, r6d
  154    lea       r2d, [r2+r5+16]     ; total + rounding bias
  155    shr       r2d, 5              ; dc value
  156%if cpuflag(ssse3)
  157    pxor       m1, m1             ; zero reg needed by SPLATB_REG's pshufb
  158%endif
  159    SPLATB_REG m0, r2, m1         ; broadcast dc byte across m0
  160
  161%if mmsize==8
  162    mov       r3d, 8              ; mmx: 2 rows/iter, two 8-byte stores each
  163.loop:
  164    mova [r4+r1*0+0], m0
  165    mova [r4+r1*0+8], m0
  166    mova [r4+r1*1+0], m0
  167    mova [r4+r1*1+8], m0
  168%else
  169    mov       r3d, 4              ; xmm: 4 rows/iter, one 16-byte store each
  170.loop:
  171    mova [r4+r1*0], m0
  172    mova [r4+r1*1], m0
  173    lea   r4, [r4+r1*2]
  174    mova [r4+r1*0], m0
  175    mova [r4+r1*1], m0
  176%endif
  177    lea   r4, [r4+r1*2]
  178    dec   r3d
  179    jg .loop
  180    REP_RET
  181%endmacro
  182
  183INIT_MMX mmxext
  184PRED16x16_DC
  185INIT_XMM sse2
  186PRED16x16_DC
  187INIT_XMM ssse3
  188PRED16x16_DC
 189
 190;-----------------------------------------------------------------------------
 191; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
 192;-----------------------------------------------------------------------------
 193
  194%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
    ; VP8 TrueMotion 16x16: dst[x] = clip(top[x] + left[y] - topleft).
    ; The top row is widened to words once; per row the (left - topleft)
    ; delta is broadcast and added, then packed back to bytes.
  195    sub        r0, r1             ; r0 -> row above
  196    pxor      mm7, mm7
  197    movq      mm0, [r0+0]
  198    movq      mm2, [r0+8]
  199    movq      mm1, mm0
  200    movq      mm3, mm2
    ; mm0..mm3 = top row as 4x4 words (mm7 reused as broadcast reg in loop)
  201    punpcklbw mm0, mm7
  202    punpckhbw mm1, mm7
  203    punpcklbw mm2, mm7
  204    punpckhbw mm3, mm7
  205    movzx     r3d, byte [r0-1]    ; topleft pixel
  206    mov       r4d, 16             ; one row per iteration
  207.loop:
  208    movzx     r2d, byte [r0+r1-1] ; left pixel of the current row
  209    sub       r2d, r3d            ; delta = left - topleft (signed)
  210    movd      mm4, r2d
  211    SPLATW    mm4, mm4, 0         ; broadcast delta to 4 words
  212    movq      mm5, mm4
  213    movq      mm6, mm4
  214    movq      mm7, mm4
  215    paddw     mm4, mm0
  216    paddw     mm5, mm1
  217    paddw     mm6, mm2
  218    paddw     mm7, mm3
  219    packuswb  mm4, mm5            ; saturating pack = clip to [0,255]
  220    packuswb  mm6, mm7
  221    movq [r0+r1+0], mm4
  222    movq [r0+r1+8], mm6
  223    add        r0, r1
  224    dec       r4d
  225    jg .loop
  226    REP_RET
  227%endmacro
  228
  229INIT_MMX mmx
  230PRED16x16_TM
  231INIT_MMX mmxext
  232PRED16x16_TM
 234
  235INIT_XMM sse2
  236cglobal pred16x16_tm_vp8_8, 2,6,6
    ; SSE2 TrueMotion 16x16: same math as the MMX version but processes two
    ; rows per iteration with the top row held as two xmm word vectors.
  237    sub          r0, r1           ; r0 -> row above
  238    pxor       xmm2, xmm2
  239    movdqa     xmm0, [r0]
  240    movdqa     xmm1, xmm0
  241    punpcklbw  xmm0, xmm2         ; top[0..7] as words
  242    punpckhbw  xmm1, xmm2         ; top[8..15] as words
  243    movzx       r4d, byte [r0-1]  ; topleft pixel
  244    mov         r5d, 8            ; 8 iterations x 2 rows
  245.loop:
  246    movzx       r2d, byte [r0+r1*1-1]
  247    movzx       r3d, byte [r0+r1*2-1]
  248    sub         r2d, r4d          ; delta for row 1
  249    sub         r3d, r4d          ; delta for row 2
  250    movd       xmm2, r2d
  251    movd       xmm4, r3d
    ; broadcast each 16-bit delta across the whole xmm register
  252    pshuflw    xmm2, xmm2, 0
  253    pshuflw    xmm4, xmm4, 0
  254    punpcklqdq xmm2, xmm2
  255    punpcklqdq xmm4, xmm4
  256    movdqa     xmm3, xmm2
  257    movdqa     xmm5, xmm4
  258    paddw      xmm2, xmm0
  259    paddw      xmm3, xmm1
  260    paddw      xmm4, xmm0
  261    paddw      xmm5, xmm1
  262    packuswb   xmm2, xmm3         ; clip to [0,255]
  263    packuswb   xmm4, xmm5
  264    movdqa [r0+r1*1], xmm2
  265    movdqa [r0+r1*2], xmm4
  266    lea          r0, [r0+r1*2]
  267    dec         r5d
  268    jg .loop
  269    REP_RET
 270
  271%if HAVE_AVX2_EXTERNAL
  272INIT_YMM avx2
  273cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
    ; AVX2 TrueMotion 16x16: m0 holds (top - topleft) pre-subtracted as 16
    ; words; per iteration four left pixels are broadcast, widened, added,
    ; packed and the two 128-bit lanes stored to the right rows.
  274    sub                       dstq, strideq     ; dst -> row above
  275    pmovzxbw                    m0, [dstq]      ; top row as 16 words
  276    vpbroadcastb               xm1, [r0-1]      ; topleft byte (r0 == dstq)
  277    pmovzxbw                    m1, xm1
  278    psubw                       m0, m1          ; m0 = top - topleft
  279    mov                 iterationd, 4           ; 4 iterations x 4 rows
  280    lea                   stride3q, [strideq*3]
  281.loop:
  282    vpbroadcastb               xm1, [dstq+strideq*1-1]
  283    vpbroadcastb               xm2, [dstq+strideq*2-1]
  284    vpbroadcastb               xm3, [dstq+stride3q-1]
  285    vpbroadcastb               xm4, [dstq+strideq*4-1]
  286    pmovzxbw                    m1, xm1
  287    pmovzxbw                    m2, xm2
  288    pmovzxbw                    m3, xm3
  289    pmovzxbw                    m4, xm4
  290    paddw                       m1, m0
  291    paddw                       m2, m0
  292    paddw                       m3, m0
  293    paddw                       m4, m0
    ; pack pairs of rows; vpermq fixes the lane interleave of vpackuswb
  294    vpackuswb                   m1, m1, m2
  295    vpackuswb                   m3, m3, m4
  296    vpermq                      m1, m1, q3120
  297    vpermq                      m3, m3, q3120
  298    movdqa        [dstq+strideq*1], xm1
  299    vextracti128  [dstq+strideq*2], m1, 1
  300    movdqa       [dstq+stride3q*1], xm3
  301    vextracti128  [dstq+strideq*4], m3, 1
  302    lea                       dstq, [dstq+strideq*4]
  303    dec                 iterationd
  304    jg .loop
  305    REP_RET
  306%endif
 307
 308;-----------------------------------------------------------------------------
 309; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
 310;-----------------------------------------------------------------------------
 311
  312%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
    ; Plane 16x16 prediction, parameterized on the codec rounding variant
    ; (%1 = h264 / rv40 / svq3).  Computes the H (horizontal) gradient from
    ; the top row, the V (vertical) gradient from the left column, derives
    ; the plane offset 'a', then fills the block with
    ;   pred[y][x] = clip((a + x*H + y*V) >> 5).
  313cglobal pred16x16_plane_%1_8, 2,9,7
  314    mov          r2, r1           ; +stride
  315    neg          r1               ; -stride
  316
    ; --- H coefficient: sum of i*(top[7+i] - top[7-i]) for i=1..8 ---
  317    movh         m0, [r0+r1  -1]
  318%if mmsize == 8
  319    pxor         m4, m4
  320    movh         m1, [r0+r1  +3 ]
  321    movh         m2, [r0+r1  +8 ]
  322    movh         m3, [r0+r1  +12]
  323    punpcklbw    m0, m4
  324    punpcklbw    m1, m4
  325    punpcklbw    m2, m4
  326    punpcklbw    m3, m4
  327    pmullw       m0, [pw_m8tom1  ]
  328    pmullw       m1, [pw_m8tom1+8]
  329    pmullw       m2, [pw_1to8    ]
  330    pmullw       m3, [pw_1to8  +8]
  331    paddw        m0, m2
  332    paddw        m1, m3
  333%else ; mmsize == 16
  334%if cpuflag(ssse3)
  335    movhps       m0, [r0+r1  +8]
  336    pmaddubsw    m0, [plane_shuf] ; H coefficients
  337%else ; sse2
  338    pxor         m2, m2
  339    movh         m1, [r0+r1  +8]
  340    punpcklbw    m0, m2
  341    punpcklbw    m1, m2
  342    pmullw       m0, [pw_m8tom1]
  343    pmullw       m1, [pw_1to8]
  344    paddw        m0, m1
  345%endif
  346    movhlps      m1, m0
  347%endif
    ; horizontal add of the word partial sums down to one word in m0
  348    paddw        m0, m1
  349%if cpuflag(mmxext)
  350    PSHUFLW      m1, m0, 0xE
  351%elif cpuflag(mmx)
  352    mova         m1, m0
  353    psrlq        m1, 32
  354%endif
  355    paddw        m0, m1
  356%if cpuflag(mmxext)
  357    PSHUFLW      m1, m0, 0x1
  358%elif cpuflag(mmx)
  359    mova         m1, m0
  360    psrlq        m1, 16
  361%endif
  362    paddw        m0, m1           ; sum of H coefficients
  363
    ; --- V coefficient: weighted differences of left-column pixels, summed
    ; scalar into r5; on x86-32, e_reg aliases r0 and r0 is reloaded later ---
  364    lea          r4, [r0+r2*8-1]
  365    lea          r3, [r0+r2*4-1]
  366    add          r4, r2
  367
  368%if ARCH_X86_64
  369%define e_reg r8
  370%else
  371%define e_reg r0
  372%endif
  373
  374    movzx     e_reg, byte [r3+r2*2   ]
  375    movzx        r5, byte [r4+r1     ]
  376    sub          r5, e_reg
  377
  378    movzx     e_reg, byte [r3+r2     ]
  379    movzx        r6, byte [r4        ]
  380    sub          r6, e_reg
  381    lea          r5, [r5+r6*2]
  382
  383    movzx     e_reg, byte [r3+r1     ]
  384    movzx        r6, byte [r4+r2*2   ]
  385    sub          r6, e_reg
  386    lea          r5, [r5+r6*4]
  387
  388    movzx     e_reg, byte [r3        ]
  389%if ARCH_X86_64
  390    movzx        r7, byte [r4+r2     ]
  391    sub          r7, e_reg
  392%else
  393    movzx        r6, byte [r4+r2     ]
  394    sub          r6, e_reg
  395    lea          r5, [r5+r6*4]
  396    sub          r5, r6
  397%endif
  398
  399    lea       e_reg, [r3+r1*4]
  400    lea          r3, [r4+r2*4]
  401
  402    movzx        r4, byte [e_reg+r2  ]
  403    movzx        r6, byte [r3        ]
  404    sub          r6, r4
  405%if ARCH_X86_64
  406    lea          r6, [r7+r6*2]
  407    lea          r5, [r5+r6*2]
  408    add          r5, r6
  409%else
  410    lea          r5, [r5+r6*4]
  411    lea          r5, [r5+r6*2]
  412%endif
  413
  414    movzx        r4, byte [e_reg     ]
  415%if ARCH_X86_64
  416    movzx        r7, byte [r3   +r2  ]
  417    sub          r7, r4
  418    sub          r5, r7
  419%else
  420    movzx        r6, byte [r3   +r2  ]
  421    sub          r6, r4
  422    lea          r5, [r5+r6*8]
  423    sub          r5, r6
  424%endif
  425
  426    movzx        r4, byte [e_reg+r1  ]
  427    movzx        r6, byte [r3   +r2*2]
  428    sub          r6, r4
  429%if ARCH_X86_64
  430    add          r6, r7
  431%endif
  432    lea          r5, [r5+r6*8]
  433
  434    movzx        r4, byte [e_reg+r2*2]
  435    movzx        r6, byte [r3   +r1  ]
  436    sub          r6, r4
  437    lea          r5, [r5+r6*4]
  438    add          r5, r6           ; sum of V coefficients
  439
  440%if ARCH_X86_64 == 0
  441    mov          r0, r0m          ; restore src (e_reg clobbered r0)
  442%endif
  443
    ; --- variant-specific V scaling: b = scaled gradient ---
  444%ifidn %1, h264
  445    lea          r5, [r5*5+32]
  446    sar          r5, 6
  447%elifidn %1, rv40
  448    lea          r5, [r5*5]
  449    sar          r5, 6
  450%elifidn %1, svq3
    ; svq3 rounds towards zero at each step: cmovs adds the bias only for
    ; negative values before the arithmetic shift
  451    test         r5, r5
  452    lea          r6, [r5+3]
  453    cmovs        r5, r6
  454    sar          r5, 2            ; V/4
  455    lea          r5, [r5*5]       ; 5*(V/4)
  456    test         r5, r5
  457    lea          r6, [r5+15]
  458    cmovs        r5, r6
  459    sar          r5, 4            ; (5*(V/4))/16
  460%endif
  461
    ; --- plane offset: 16*(top[15] + left[15] + 1) ---
  462    movzx        r4, byte [r0+r1  +15]
  463    movzx        r3, byte [r3+r2*2   ]
  464    lea          r3, [r3+r4+1]
  465    shl          r3, 4
  466
    ; --- variant-specific H scaling (same schemes as V above) ---
  467    movd        r1d, m0
  468    movsx       r1d, r1w
  469%ifnidn %1, svq3
  470%ifidn %1, h264
  471    lea         r1d, [r1d*5+32]
  472%else ; rv40
  473    lea         r1d, [r1d*5]
  474%endif
  475    sar         r1d, 6
  476%else ; svq3
  477    test        r1d, r1d
  478    lea         r4d, [r1d+3]
  479    cmovs       r1d, r4d
  480    sar         r1d, 2           ; H/4
  481    lea         r1d, [r1d*5]     ; 5*(H/4)
  482    test        r1d, r1d
  483    lea         r4d, [r1d+15]
  484    cmovs       r1d, r4d
  485    sar         r1d, 4           ; (5*(H/4))/16
  486%endif
  487    movd         m0, r1d
  488
    ; a = 16*(top[15]+left[15]+1) - 7*(H+V)
  489    add         r1d, r5d
  490    add         r3d, r1d
  491    shl         r1d, 3
  492    sub         r3d, r1d          ; a
  493
  494    movd         m1, r5d
  495    movd         m3, r3d
  496    SPLATW       m0, m0, 0        ; H
  497    SPLATW       m1, m1, 0        ; V
  498    SPLATW       m3, m3, 0        ; a
  499%ifidn %1, svq3
  500    SWAP          0, 1            ; svq3 swaps the roles of H and V
  501%endif
  502    mova         m2, m0
  503%if mmsize == 8
  504    mova         m5, m0
  505%endif
  506    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
  507%if mmsize == 16
  508    psllw        m2, 3
  509%else
  510    psllw        m5, 3
  511    psllw        m2, 2
  512    mova         m6, m5
  513    paddw        m6, m2
  514%endif
  515    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
  516    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
  517%if mmsize == 8
  518    paddw        m5, m0           ; a + {8,9,10,11}*H
  519    paddw        m6, m0           ; a + {12,13,14,15}*H
  520%endif
  521
    ; --- fill loop: per row, shift the word accumulators right by 5, pack
    ; to bytes, store, then add V; 2 rows per iteration ---
  522    mov          r4, 8
  523.loop:
  524    mova         m3, m0           ; b[0..7]
  525    mova         m4, m2           ; b[8..15]
  526    psraw        m3, 5
  527    psraw        m4, 5
  528    packuswb     m3, m4
  529    mova       [r0], m3
  530%if mmsize == 8
  531    mova         m3, m5           ; b[8..11]
  532    mova         m4, m6           ; b[12..15]
  533    psraw        m3, 5
  534    psraw        m4, 5
  535    packuswb     m3, m4
  536    mova     [r0+8], m3
  537%endif
  538    paddw        m0, m1
  539    paddw        m2, m1
  540%if mmsize == 8
  541    paddw        m5, m1
  542    paddw        m6, m1
  543%endif
  544
  545    mova         m3, m0           ; b[0..7]
  546    mova         m4, m2           ; b[8..15]
  547    psraw        m3, 5
  548    psraw        m4, 5
  549    packuswb     m3, m4
  550    mova    [r0+r2], m3
  551%if mmsize == 8
  552    mova         m3, m5           ; b[8..11]
  553    mova         m4, m6           ; b[12..15]
  554    psraw        m3, 5
  555    psraw        m4, 5
  556    packuswb     m3, m4
  557    mova  [r0+r2+8], m3
  558%endif
  559    paddw        m0, m1
  560    paddw        m2, m1
  561%if mmsize == 8
  562    paddw        m5, m1
  563    paddw        m6, m1
  564%endif
  565
  566    lea          r0, [r0+r2*2]
  567    dec          r4
  568    jg .loop
  569    REP_RET
  570%endmacro
  571
  572INIT_MMX mmx
  573H264_PRED16x16_PLANE h264
  574H264_PRED16x16_PLANE rv40
  575H264_PRED16x16_PLANE svq3
  576INIT_MMX mmxext
  577H264_PRED16x16_PLANE h264
  578H264_PRED16x16_PLANE rv40
  579H264_PRED16x16_PLANE svq3
  580INIT_XMM sse2
  581H264_PRED16x16_PLANE h264
  582H264_PRED16x16_PLANE rv40
  583H264_PRED16x16_PLANE svq3
  584INIT_XMM ssse3
  585H264_PRED16x16_PLANE h264
  586H264_PRED16x16_PLANE rv40
  587H264_PRED16x16_PLANE svq3
 588
 589;-----------------------------------------------------------------------------
 590; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
 591;-----------------------------------------------------------------------------
 592
  593%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
    ; Plane 8x8 prediction (H.264 chroma): computes H from the top row and
    ; V from the left column, then fills the block with
    ;   pred[y][x] = clip((a + x*H + y*V) >> 5).
    ; Structure mirrors H264_PRED16x16_PLANE but with 4-tap gradients.
  594cglobal pred8x8_plane_8, 2,9,7
  595    mov          r2, r1           ; +stride
  596    neg          r1               ; -stride
  597
    ; --- H coefficient from top row: sum of i*(top[3+i] - top[3-i]) ---
  598    movd         m0, [r0+r1  -1]
  599%if mmsize == 8
  600    pxor         m2, m2
  601    movh         m1, [r0+r1  +4 ]
  602    punpcklbw    m0, m2
  603    punpcklbw    m1, m2
  604    pmullw       m0, [pw_m4to4]
  605    pmullw       m1, [pw_m4to4+8]
  606%else ; mmsize == 16
  607%if cpuflag(ssse3)
  608    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
  609    pmaddubsw    m0, [plane8_shuf] ; H coefficients
  610%else ; sse2
  611    pxor         m2, m2
  612    movd         m1, [r0+r1  +4]
  613    punpckldq    m0, m1
  614    punpcklbw    m0, m2
  615    pmullw       m0, [pw_m4to4]
  616%endif
  617    movhlps      m1, m0
  618%endif
    ; horizontal add of the partial word sums down to one word
  619    paddw        m0, m1
  620
  621%if notcpuflag(ssse3)
  622%if cpuflag(mmxext)
  623    PSHUFLW      m1, m0, 0xE
  624%elif cpuflag(mmx)
  625    mova         m1, m0
  626    psrlq        m1, 32
  627%endif
  628    paddw        m0, m1
  629%endif ; !ssse3
  630
  631%if cpuflag(mmxext)
  632    PSHUFLW      m1, m0, 0x1
  633%elif cpuflag(mmx)
  634    mova         m1, m0
  635    psrlq        m1, 16
  636%endif
  637    paddw        m0, m1           ; sum of H coefficients
  638
    ; --- V coefficient from the left column (scalar); on x86-32, e_reg
    ; aliases r0, which is reloaded from the stack afterwards ---
  639    lea          r4, [r0+r2*4-1]
  640    lea          r3, [r0     -1]
  641    add          r4, r2
  642
  643%if ARCH_X86_64
  644%define e_reg r8
  645%else
  646%define e_reg r0
  647%endif
  648
  649    movzx     e_reg, byte [r3+r2*2   ]
  650    movzx        r5, byte [r4+r1     ]
  651    sub          r5, e_reg
  652
  653    movzx     e_reg, byte [r3        ]
  654%if ARCH_X86_64
  655    movzx        r7, byte [r4+r2     ]
  656    sub          r7, e_reg
  657    sub          r5, r7
  658%else
  659    movzx        r6, byte [r4+r2     ]
  660    sub          r6, e_reg
  661    lea          r5, [r5+r6*4]
  662    sub          r5, r6
  663%endif
  664
  665    movzx     e_reg, byte [r3+r1     ]
  666    movzx        r6, byte [r4+r2*2   ]
  667    sub          r6, e_reg
  668%if ARCH_X86_64
  669    add          r6, r7
  670%endif
  671    lea          r5, [r5+r6*4]
  672
  673    movzx     e_reg, byte [r3+r2     ]
  674    movzx        r6, byte [r4        ]
  675    sub          r6, e_reg
  676    lea          r6, [r5+r6*2]
  677
    ; V = (17*sum + 16) >> 5
  678    lea          r5, [r6*9+16]
  679    lea          r5, [r5+r6*8]
  680    sar          r5, 5
  681
  682%if ARCH_X86_64 == 0
  683    mov          r0, r0m          ; restore src (e_reg clobbered r0)
  684%endif
  685
    ; plane offset: 16*(left[7] + top[7] + 1), then H = (17*sum+16)>>5
  686    movzx        r3, byte [r4+r2*2  ]
  687    movzx        r4, byte [r0+r1  +7]
  688    lea          r3, [r3+r4+1]
  689    shl          r3, 4
  690    movd        r1d, m0
  691    movsx       r1d, r1w
  692    imul        r1d, 17
  693    add         r1d, 16
  694    sar         r1d, 5
  695    movd         m0, r1d
    ; a = 16*(left[7]+top[7]+1) - 3*(H+V)
  696    add         r1d, r5d
  697    sub         r3d, r1d
  698    add         r1d, r1d
  699    sub         r3d, r1d          ; a
  700
  701    movd         m1, r5d
  702    movd         m3, r3d
  703    SPLATW       m0, m0, 0        ; H
  704    SPLATW       m1, m1, 0        ; V
  705    SPLATW       m3, m3, 0        ; a
  706%if mmsize == 8
  707    mova         m2, m0
  708%endif
  709    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
  710    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
  711%if mmsize == 8
  712    psllw        m2, 2
  713    paddw        m2, m0           ; a + {4,5,6,7}*H
  714%endif
  715
    ; --- fill loop: shift accumulators >>5, pack, store; 2 rows/iter ---
  716    mov          r4, 4
  717ALIGN 16
  718.loop:
  719%if mmsize == 16
  720    mova         m3, m0           ; b[0..7]
  721    paddw        m0, m1
  722    psraw        m3, 5
  723    mova         m4, m0           ; V+b[0..7]
  724    paddw        m0, m1
  725    psraw        m4, 5
  726    packuswb     m3, m4
  727    movh       [r0], m3
  728    movhps  [r0+r2], m3
  729%else ; mmsize == 8
  730    mova         m3, m0           ; b[0..3]
  731    mova         m4, m2           ; b[4..7]
  732    paddw        m0, m1
  733    paddw        m2, m1
  734    psraw        m3, 5
  735    psraw        m4, 5
  736    mova         m5, m0           ; V+b[0..3]
  737    mova         m6, m2           ; V+b[4..7]
  738    paddw        m0, m1
  739    paddw        m2, m1
  740    psraw        m5, 5
  741    psraw        m6, 5
  742    packuswb     m3, m4
  743    packuswb     m5, m6
  744    mova       [r0], m3
  745    mova    [r0+r2], m5
  746%endif
  747
  748    lea          r0, [r0+r2*2]
  749    dec          r4
  750    jg .loop
  751    REP_RET
  752%endmacro
  753
  754INIT_MMX mmx
  755H264_PRED8x8_PLANE
  756INIT_MMX mmxext
  757H264_PRED8x8_PLANE
  758INIT_XMM sse2
  759H264_PRED8x8_PLANE
  760INIT_XMM ssse3
  761H264_PRED8x8_PLANE
 762
 763;-----------------------------------------------------------------------------
 764; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
 765;-----------------------------------------------------------------------------
 766
  767INIT_MMX mmx
  768cglobal pred8x8_vertical_8, 2,2
    ; Vertical 8x8 prediction: copy the 8-byte row above into all 8 rows.
  769    sub    r0, r1                 ; r0 -> row above
  770    movq  mm0, [r0]               ; top row
  771%rep 3
  772    movq [r0+r1*1], mm0
  773    movq [r0+r1*2], mm0
  774    lea    r0, [r0+r1*2]
  775%endrep
  776    movq [r0+r1*1], mm0
  777    movq [r0+r1*2], mm0
  778    RET
 779
 780;-----------------------------------------------------------------------------
 781; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
 782;-----------------------------------------------------------------------------
 783
  784%macro PRED8x8_H 0
  785cglobal pred8x8_horizontal_8, 2,3
    ; Horizontal 8x8 prediction: each row filled with its left pixel.
    ; SPLATB_LOAD broadcasts the byte at the given address (m2 is the
    ; pshufb mask on ssse3, unused otherwise).
  786    mov       r2, 4               ; 4 iterations x 2 rows
  787%if cpuflag(ssse3)
  788    mova      m2, [pb_3]
  789%endif
  790.loop:
  791    SPLATB_LOAD m0, r0+r1*0-1, m2
  792    SPLATB_LOAD m1, r0+r1*1-1, m2
  793    mova [r0+r1*0], m0
  794    mova [r0+r1*1], m1
  795    lea       r0, [r0+r1*2]
  796    dec       r2
  797    jg .loop
  798    REP_RET
  799%endmacro
  800
  801INIT_MMX mmx
  802PRED8x8_H
  803INIT_MMX mmxext
  804PRED8x8_H
  805INIT_MMX ssse3
  806PRED8x8_H
 807
 808;-----------------------------------------------------------------------------
 809; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
 810;-----------------------------------------------------------------------------
  811INIT_MMX mmxext
  812cglobal pred8x8_top_dc_8, 2,5
    ; Top-DC 8x8: each 4x8 half is filled with the rounded average of its
    ; own 4 top pixels: dc = (sum + 2) >> 2, computed as (psadbw>>1 pavg 0).
    ; Address computations (lea) are interleaved with the SIMD for pairing.
  813    sub         r0, r1            ; r0 -> row above
  814    movq       mm0, [r0]
  815    pxor       mm1, mm1
  816    pxor       mm2, mm2
  817    lea         r2, [r0+r1*2]
  818    punpckhbw  mm1, mm0           ; right 4 top pixels (in high bytes)
  819    punpcklbw  mm0, mm2           ; left 4 top pixels (in high bytes)
  820    psadbw     mm1, mm2        ; s1
  821    lea         r3, [r2+r1*2]
  822    psadbw     mm0, mm2        ; s0
    ; (s >> 1) pavg 0 == (s + 2) >> 2 given the byte positions above
  823    psrlw      mm1, 1
  824    psrlw      mm0, 1
  825    pavgw      mm1, mm2
  826    lea         r4, [r3+r1*2]
  827    pavgw      mm0, mm2
  828    pshufw     mm1, mm1, 0
  829    pshufw     mm0, mm0, 0     ; dc0 (w)
  830    packuswb   mm0, mm1        ; dc0,dc1 (b)
  831    movq [r0+r1*1], mm0
  832    movq [r0+r1*2], mm0
  833    lea         r0, [r3+r1*2]
  834    movq [r2+r1*1], mm0
  835    movq [r2+r1*2], mm0
  836    movq [r3+r1*1], mm0
  837    movq [r3+r1*2], mm0
  838    movq [r0+r1*1], mm0
  839    movq [r0+r1*2], mm0
  840    RET
 841
 842;-----------------------------------------------------------------------------
 843; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
 844;-----------------------------------------------------------------------------
 845
  846INIT_MMX mmxext
  847cglobal pred8x8_dc_8, 2,5
    ; Full DC 8x8 (H.264 chroma): the block is split into four 4x4
    ; quadrants with per-quadrant DC values derived from the four partial
    ; sums s0/s1 (top halves) and s2/s3 (left halves).
  848    sub       r0, r1              ; r0 -> row above
  849    pxor      m7, m7
  850    movd      m0, [r0+0]
  851    movd      m1, [r0+4]
  852    psadbw    m0, m7            ; s0
  853    mov       r4, r0
  854    psadbw    m1, m7            ; s1
  855
    ; sum top-half (rows 0-3) and bottom-half (rows 4-7) left pixels
  856    movzx    r2d, byte [r0+r1*1-1]
  857    movzx    r3d, byte [r0+r1*2-1]
  858    lea       r0, [r0+r1*2]
  859    add      r2d, r3d
  860    movzx    r3d, byte [r0+r1*1-1]
  861    add      r2d, r3d
  862    movzx    r3d, byte [r0+r1*2-1]
  863    add      r2d, r3d
  864    lea       r0, [r0+r1*2]
  865    movd      m2, r2d            ; s2
  866    movzx    r2d, byte [r0+r1*1-1]
  867    movzx    r3d, byte [r0+r1*2-1]
  868    lea       r0, [r0+r1*2]
  869    add      r2d, r3d
  870    movzx    r3d, byte [r0+r1*1-1]
  871    add      r2d, r3d
  872    movzx    r3d, byte [r0+r1*2-1]
  873    add      r2d, r3d
  874    movd      m3, r2d            ; s3
  875
    ; combine the four sums per-quadrant with pshufw, then average
  876    punpcklwd m0, m1
  877    mov       r0, r4
  878    punpcklwd m2, m3
  879    punpckldq m0, m2            ; s0, s1, s2, s3
  880    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
  881    lea       r2, [r0+r1*2]
  882    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
  883    paddw     m0, m3
  884    lea       r3, [r2+r1*2]
  885    psrlw     m0, 2
  886    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
  887    lea       r4, [r3+r1*2]
    ; expand the four dc bytes so each fills a 4-byte quadrant column
  888    packuswb  m0, m0
  889    punpcklbw m0, m0
  890    movq      m1, m0
  891    punpcklbw m0, m0            ; dc for top quadrants
  892    punpckhbw m1, m1            ; dc for bottom quadrants
  893    movq [r0+r1*1], m0
  894    movq [r0+r1*2], m0
  895    movq [r2+r1*1], m0
  896    movq [r2+r1*2], m0
  897    movq [r3+r1*1], m1
  898    movq [r3+r1*2], m1
  899    movq [r4+r1*1], m1
  900    movq [r4+r1*2], m1
  901    RET
 902
 903;-----------------------------------------------------------------------------
 904; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
 905;-----------------------------------------------------------------------------
 906
  907INIT_MMX mmxext
  908cglobal pred8x8_dc_rv40_8, 2,7
    ; RV40 DC 8x8: a single dc = (8 top + 8 left pixels + 8) >> 4 fills the
    ; whole block (unlike H.264's per-quadrant chroma DC).
  909    mov       r4, r0              ; keep block origin for the store loop
  910    sub       r0, r1              ; r0 -> row above
  911    pxor      mm0, mm0
  912    psadbw    mm0, [r0]           ; sum of 8 top pixels
  913    dec        r0                 ; r0 -> one left of top row
  914    movzx     r5d, byte [r0+r1*1]
  915    movd      r6d, mm0
  916    lea        r0, [r0+r1*2]
    ; accumulate the remaining left pixels, two per step
  917%rep 3
  918    movzx     r2d, byte [r0+r1*0]
  919    movzx     r3d, byte [r0+r1*1]
  920    add       r5d, r2d
  921    add       r6d, r3d
  922    lea        r0, [r0+r1*2]
  923%endrep
  924    movzx     r2d, byte [r0+r1*0] ; 8th left pixel
  925    add       r5d, r6d
  926    lea       r2d, [r2+r5+8]      ; total + rounding bias
  927    shr       r2d, 4              ; dc value
  928    movd      mm0, r2d
  929    punpcklbw mm0, mm0
  930    pshufw    mm0, mm0, 0         ; broadcast dc byte across mm0
  931    mov       r3d, 4              ; 4 iterations x 2 rows
  932.loop:
  933    movq [r4+r1*0], mm0
  934    movq [r4+r1*1], mm0
  935    lea   r4, [r4+r1*2]
  936    dec   r3d
  937    jg .loop
  938    REP_RET
 939
 940;-----------------------------------------------------------------------------
 941; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
 942;-----------------------------------------------------------------------------
 943
  944%macro PRED8x8_TM 0
  945cglobal pred8x8_tm_vp8_8, 2,6
    ; VP8 TrueMotion 8x8: dst[x] = clip(top[x] + left[y] - topleft).
    ; Top row widened to words once; two rows per loop iteration.
  946    sub        r0, r1             ; r0 -> row above
  947    pxor      mm7, mm7
  948    movq      mm0, [r0]
  949    movq      mm1, mm0
  950    punpcklbw mm0, mm7            ; top[0..3] as words
  951    punpckhbw mm1, mm7            ; top[4..7] as words
  952    movzx     r4d, byte [r0-1]    ; topleft pixel
  953    mov       r5d, 4              ; 4 iterations x 2 rows
  954.loop:
  955    movzx     r2d, byte [r0+r1*1-1]
  956    movzx     r3d, byte [r0+r1*2-1]
  957    sub       r2d, r4d            ; delta = left - topleft (per row)
  958    sub       r3d, r4d
  959    movd      mm2, r2d
  960    movd      mm4, r3d
  961    SPLATW    mm2, mm2, 0         ; broadcast deltas
  962    SPLATW    mm4, mm4, 0
  963    movq      mm3, mm2
  964    movq      mm5, mm4
  965    paddw     mm2, mm0
  966    paddw     mm3, mm1
  967    paddw     mm4, mm0
  968    paddw     mm5, mm1
  969    packuswb  mm2, mm3            ; clip to [0,255]
  970    packuswb  mm4, mm5
  971    movq [r0+r1*1], mm2
  972    movq [r0+r1*2], mm4
  973    lea        r0, [r0+r1*2]
  974    dec       r5d
  975    jg .loop
  976    REP_RET
  977%endmacro
  978
  979INIT_MMX mmx
  980PRED8x8_TM
  981INIT_MMX mmxext
  982PRED8x8_TM
 983
  984INIT_XMM sse2
  985cglobal pred8x8_tm_vp8_8, 2,6,4
    ; SSE2 TrueMotion 8x8: two rows per iteration; each 8-wide row of words
    ; occupies half of an xmm register, packed and stored with movq/movhps.
  986    sub          r0, r1           ; r0 -> row above
  987    pxor       xmm1, xmm1
  988    movq       xmm0, [r0]
  989    punpcklbw  xmm0, xmm1         ; top row as 8 words
  990    movzx       r4d, byte [r0-1]  ; topleft pixel
  991    mov         r5d, 4            ; 4 iterations x 2 rows
  992.loop:
  993    movzx       r2d, byte [r0+r1*1-1]
  994    movzx       r3d, byte [r0+r1*2-1]
  995    sub         r2d, r4d          ; per-row delta = left - topleft
  996    sub         r3d, r4d
  997    movd       xmm2, r2d
  998    movd       xmm3, r3d
    ; broadcast the 16-bit delta across the whole register
  999    pshuflw    xmm2, xmm2, 0
 1000    pshuflw    xmm3, xmm3, 0
 1001    punpcklqdq xmm2, xmm2
 1002    punpcklqdq xmm3, xmm3
 1003    paddw      xmm2, xmm0
 1004    paddw      xmm3, xmm0
 1005    packuswb   xmm2, xmm3         ; row1 in low half, row2 in high half
 1006    movq   [r0+r1*1], xmm2
 1007    movhps [r0+r1*2], xmm2
 1008    lea          r0, [r0+r1*2]
 1009    dec         r5d
 1010    jg .loop
 1011    REP_RET
1012
 1013INIT_XMM ssse3
 1014cglobal pred8x8_tm_vp8_8, 2,3,6
    ; SSSE3 TrueMotion 8x8: tm_shuf broadcasts byte 3 of a dword load into
    ; every word lane (with 0x80 zeroing the high bytes), so the left and
    ; topleft pixels become word vectors without scalar loads.
 1015    sub          r0, r1           ; r0 -> row above
 1016    movdqa     xmm4, [tm_shuf]
 1017    pxor       xmm1, xmm1
 1018    movq       xmm0, [r0]
 1019    punpcklbw  xmm0, xmm1         ; top row as 8 words
 1020    movd       xmm5, [r0-4]
 1021    pshufb     xmm5, xmm4         ; topleft broadcast as words
 1022    mov         r2d, 4            ; 4 iterations x 2 rows
 1023.loop:
 1024    movd       xmm2, [r0+r1*1-4]
 1025    movd       xmm3, [r0+r1*2-4]
 1026    pshufb     xmm2, xmm4         ; left pixel of row broadcast as words
 1027    pshufb     xmm3, xmm4
 1028    psubw      xmm2, xmm5         ; left - topleft
 1029    psubw      xmm3, xmm5
 1030    paddw      xmm2, xmm0         ; + top row
 1031    paddw      xmm3, xmm0
 1032    packuswb   xmm2, xmm3         ; clip; row1 low half, row2 high half
 1033    movq   [r0+r1*1], xmm2
 1034    movhps [r0+r1*2], xmm2
 1035    lea          r0, [r0+r1*2]
 1036    dec         r2d
 1037    jg .loop
 1038    REP_RET
1039
1040; dest, left, right, src, tmp
1041; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
 1042%macro PRED4x4_LOWPASS 5
    ; 3-tap lowpass: %1 = (%2 + 2*%4 + %3 + 2) >> 2, bytewise, using the
    ; pavgb identity with a carry correction: ((a pavg b) - ((a^b)&1)) pavg c.
    ; %2 and %3 are clobbered; %5 is a scratch register.
 1043    mova    %5, %2
 1044    pavgb   %2, %3                ; (left + right + 1) >> 1
 1045    pxor    %3, %5                ; low bits differ where rounding occurred
 1046    mova    %1, %4
 1047    pand    %3, [pb_1]
 1048    psubusb %2, %3                ; undo the upward rounding bias
 1049    pavgb   %1, %2                ; average with the center tap
 1050%endmacro
1051
1052;-----------------------------------------------------------------------------
1053; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
1054;                           ptrdiff_t stride)
1055;-----------------------------------------------------------------------------
 1056%macro PRED8x8L_TOP_DC 0
 1057cglobal pred8x8l_top_dc_8, 4,4
    ; 8x8 luma top-DC with edge filtering: the top row is lowpass-filtered
    ; (using topleft/topright neighbours when available, else substituted
    ; by edge replication), then dc = (sum + 4) >> 3 fills the block.
    ; r0 = src, r1 = has_topleft, r2 = has_topright, r3 = stride.
 1058    sub          r0, r3           ; r0 -> row above
 1059    pxor        mm7, mm7
 1060    movq        mm0, [r0-8]       ; bytes left of the top row
 1061    movq        mm3, [r0]         ; top row (center taps)
 1062    movq        mm1, [r0+8]       ; bytes right of the top row
 1063    movq        mm2, mm3
 1064    movq        mm4, mm3
 1065    PALIGNR     mm2, mm0, 7, mm0  ; top row shifted right: left taps
 1066    PALIGNR     mm1, mm4, 1, mm4  ; top row shifted left: right taps
 1067    test        r1d, r1d ; top_left
 1068    jz .fix_lt_2
 1069    test        r2d, r2d ; top_right
 1070    jz .fix_tr_1
 1071    jmp .body
.fix_lt_2:
    ; no topleft: replace the leftmost tap with top[0] via xor-patching
 1072.fix_lt_2:
 1073    movq        mm5, mm3
 1074    pxor        mm5, mm2
 1075    psllq       mm5, 56
 1076    psrlq       mm5, 56
 1077    pxor        mm2, mm5
 1078    test        r2d, r2d ; top_right
 1079    jnz .body
.fix_tr_1:
    ; no topright: replace the rightmost tap with top[7] via xor-patching
 1080.fix_tr_1:
 1081    movq        mm5, mm3
 1082    pxor        mm5, mm1
 1083    psrlq       mm5, 56
 1084    psllq       mm5, 56
 1085    pxor        mm1, mm5
 1086.body:
 1087    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
 1088    psadbw   mm7, mm0             ; sum of the 8 filtered pixels
 1089    paddw    mm7, [pw_4]          ; rounding bias
 1090    psrlw    mm7, 3               ; dc value
 1091    pshufw   mm7, mm7, 0
 1092    packuswb mm7, mm7             ; broadcast dc byte
 1093%rep 3
 1094    movq [r0+r3*1], mm7
 1095    movq [r0+r3*2], mm7
 1096    lea    r0, [r0+r3*2]
 1097%endrep
 1098    movq [r0+r3*1], mm7
 1099    movq [r0+r3*2], mm7
 1100    RET
 1101%endmacro
 1102
 1103INIT_MMX mmxext
 1104PRED8x8L_TOP_DC
 1105INIT_MMX ssse3
 1106PRED8x8L_TOP_DC
1107
1108;-----------------------------------------------------------------------------
1109; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
1110;                       ptrdiff_t stride)
1111;-----------------------------------------------------------------------------
1112
1113%macro PRED8x8L_DC 0
1114cglobal pred8x8l_dc_8, 4,5
1115    sub          r0, r3
1116    lea          r4, [r0+r3*2]
1117    movq        mm0, [r0+r3*1-8]
1118    punpckhbw   mm0, [r0+r3*0-8]
1119    movq        mm1, [r4+r3*1-8]
1120    punpckhbw   mm1, [r0+r3*2-8]
1121    mov          r4, r0
1122    punpckhwd   mm1, mm0
1123    lea          r0, [r0+r3*4]
1124    movq        mm2, [r0+r3*1-8]
1125    punpckhbw   mm2, [r0+r3*0-8]
1126    lea          r0, [r0+r3*2]
1127    movq        mm3, [r0+r3*1-8]
1128    punpckhbw   mm3, [r0+r3*0-8]
1129    punpckhwd   mm3, mm2
1130    punpckhdq   mm3, mm1
1131    lea          r0, [r0+r3*2]
1132    movq        mm0, [r0+r3*0-8]
1133    movq        mm1, [r4]
1134    mov          r0, r4
1135    movq        mm4, mm3
1136    movq        mm2, mm3
1137    PALIGNR     mm4, mm0, 7, mm0
1138    PALIGNR     mm1, mm2, 1, mm2
1139    test        r1d, r1d
1140    jnz .do_left
1141.fix_lt_1:
1142    movq        mm5, mm3
1143    pxor        mm5, mm4
1144    psrlq       mm5, 56
1145    psllq       mm5, 48
1146    pxor        mm1, mm5
1147    jmp .do_left
1148.fix_lt_2:
1149    movq        mm5, mm3
1150    pxor        mm5, mm2
1151    psllq       mm5, 56
1152    psrlq       mm5, 56
1153    pxor        mm2, mm5
1154    test        r2d, r2d
1155    jnz .body
1156.fix_tr_1:
1157    movq        mm5, mm3
1158    pxor        mm5, mm1
1159    psrlq       mm5, 56
1160    psllq       mm5, 56
1161    pxor        mm1, mm5
1162    jmp .body
1163.do_left:
1164    movq        mm0, mm4
1165    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1166    movq        mm4, mm0
1167    movq        mm7, mm2
1168    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1169    psllq       mm1, 56
1170    PALIGNR     mm7, mm1, 7, mm3
1171    movq        mm0, [r0-8]
1172    movq        mm3, [r0]
1173    movq        mm1, [r0+8]
1174    movq        mm2, mm3
1175    movq        mm4, mm3
1176    PALIGNR     mm2, mm0, 7, mm0
1177    PALIGNR     mm1, mm4, 1, mm4
1178    test        r1d, r1d
1179    jz .fix_lt_2
1180    test        r2d, r2d
1181    jz .fix_tr_1
1182.body:
1183    lea          r1, [r0+r3*2]
1184    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1185    pxor        mm0, mm0
1186    pxor        mm1, mm1
1187    lea          r2, [r1+r3*2]
1188    psadbw      mm0, mm7
1189    psadbw      mm1, mm6
1190    paddw       mm0, [pw_8]
1191    paddw       mm0, mm1
1192    lea          r4, [r2+r3*2]
1193    psrlw       mm0, 4
1194    pshufw      mm0, mm0, 0
1195    packuswb    mm0, mm0
1196    movq [r0+r3*1], mm0
1197    movq [r0+r3*2], mm0
1198    movq [r1+r3*1], mm0
1199    movq [r1+r3*2], mm0
1200    movq [r2+r3*1], mm0
1201    movq [r2+r3*2], mm0
1202    movq [r4+r3*1], mm0
1203    movq [r4+r3*2], mm0
1204    RET
1205%endmacro
1206
1207INIT_MMX mmxext
1208PRED8x8L_DC
1209INIT_MMX ssse3
1210PRED8x8L_DC
1211
1212;-----------------------------------------------------------------------------
1213; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1214;                               int has_topright, ptrdiff_t stride)
1215;-----------------------------------------------------------------------------
1216
1217%macro PRED8x8L_HORIZONTAL 0
1218cglobal pred8x8l_horizontal_8, 4,4
1219    sub          r0, r3
1220    lea          r2, [r0+r3*2]
1221    movq        mm0, [r0+r3*1-8]
1222    test        r1d, r1d
1223    lea          r1, [r0+r3]
1224    cmovnz       r1, r0
1225    punpckhbw   mm0, [r1+r3*0-8]
1226    movq        mm1, [r2+r3*1-8]
1227    punpckhbw   mm1, [r0+r3*2-8]
1228    mov          r2, r0
1229    punpckhwd   mm1, mm0
1230    lea          r0, [r0+r3*4]
1231    movq        mm2, [r0+r3*1-8]
1232    punpckhbw   mm2, [r0+r3*0-8]
1233    lea          r0, [r0+r3*2]
1234    movq        mm3, [r0+r3*1-8]
1235    punpckhbw   mm3, [r0+r3*0-8]
1236    punpckhwd   mm3, mm2
1237    punpckhdq   mm3, mm1
1238    lea          r0, [r0+r3*2]
1239    movq        mm0, [r0+r3*0-8]
1240    movq        mm1, [r1+r3*0-8]
1241    mov          r0, r2
1242    movq        mm4, mm3
1243    movq        mm2, mm3
1244    PALIGNR     mm4, mm0, 7, mm0
1245    PALIGNR     mm1, mm2, 1, mm2
1246    movq        mm0, mm4
1247    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1248    movq        mm4, mm0
1249    movq        mm7, mm2
1250    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1251    psllq       mm1, 56
1252    PALIGNR     mm7, mm1, 7, mm3
1253    movq        mm3, mm7
1254    lea         r1, [r0+r3*2]
1255    movq       mm7, mm3
1256    punpckhbw  mm3, mm3
1257    punpcklbw  mm7, mm7
1258    pshufw     mm0, mm3, 0xff
1259    pshufw     mm1, mm3, 0xaa
1260    lea         r2, [r1+r3*2]
1261    pshufw     mm2, mm3, 0x55
1262    pshufw     mm3, mm3, 0x00
1263    pshufw     mm4, mm7, 0xff
1264    pshufw     mm5, mm7, 0xaa
1265    pshufw     mm6, mm7, 0x55
1266    pshufw     mm7, mm7, 0x00
1267    movq [r0+r3*1], mm0
1268    movq [r0+r3*2], mm1
1269    movq [r1+r3*1], mm2
1270    movq [r1+r3*2], mm3
1271    movq [r2+r3*1], mm4
1272    movq [r2+r3*2], mm5
1273    lea         r0, [r2+r3*2]
1274    movq [r0+r3*1], mm6
1275    movq [r0+r3*2], mm7
1276    RET
1277%endmacro
1278
1279INIT_MMX mmxext
1280PRED8x8L_HORIZONTAL
1281INIT_MMX ssse3
1282PRED8x8L_HORIZONTAL
1283
1284;-----------------------------------------------------------------------------
1285; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
1286;                             ptrdiff_t stride)
1287;-----------------------------------------------------------------------------
1288
1289%macro PRED8x8L_VERTICAL 0
1290cglobal pred8x8l_vertical_8, 4,4
1291    sub          r0, r3
1292    movq        mm0, [r0-8]
1293    movq        mm3, [r0]
1294    movq        mm1, [r0+8]
1295    movq        mm2, mm3
1296    movq        mm4, mm3
1297    PALIGNR     mm2, mm0, 7, mm0
1298    PALIGNR     mm1, mm4, 1, mm4
1299    test        r1d, r1d ; top_left
1300    jz .fix_lt_2
1301    test        r2d, r2d ; top_right
1302    jz .fix_tr_1
1303    jmp .body
1304.fix_lt_2:
1305    movq        mm5, mm3
1306    pxor        mm5, mm2
1307    psllq       mm5, 56
1308    psrlq       mm5, 56
1309    pxor        mm2, mm5
1310    test        r2d, r2d ; top_right
1311    jnz .body
1312.fix_tr_1:
1313    movq        mm5, mm3
1314    pxor        mm5, mm1
1315    psrlq       mm5, 56
1316    psllq       mm5, 56
1317    pxor        mm1, mm5
1318.body:
1319    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1320%rep 3
1321    movq [r0+r3*1], mm0
1322    movq [r0+r3*2], mm0
1323    lea    r0, [r0+r3*2]
1324%endrep
1325    movq [r0+r3*1], mm0
1326    movq [r0+r3*2], mm0
1327    RET
1328%endmacro
1329
1330INIT_MMX mmxext
1331PRED8x8L_VERTICAL
1332INIT_MMX ssse3
1333PRED8x8L_VERTICAL
1334
1335;-----------------------------------------------------------------------------
1336; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1337;                              int has_topright, ptrdiff_t stride)
1338;-----------------------------------------------------------------------------
1339
1340INIT_MMX mmxext
1341cglobal pred8x8l_down_left_8, 4,5
1342    sub          r0, r3
1343    movq        mm0, [r0-8]
1344    movq        mm3, [r0]
1345    movq        mm1, [r0+8]
1346    movq        mm2, mm3
1347    movq        mm4, mm3
1348    PALIGNR     mm2, mm0, 7, mm0
1349    PALIGNR     mm1, mm4, 1, mm4
1350    test        r1d, r1d
1351    jz .fix_lt_2
1352    test        r2d, r2d
1353    jz .fix_tr_1
1354    jmp .do_top
1355.fix_lt_2:
1356    movq        mm5, mm3
1357    pxor        mm5, mm2
1358    psllq       mm5, 56
1359    psrlq       mm5, 56
1360    pxor        mm2, mm5
1361    test        r2d, r2d
1362    jnz .do_top
1363.fix_tr_1:
1364    movq        mm5, mm3
1365    pxor        mm5, mm1
1366    psrlq       mm5, 56
1367    psllq       mm5, 56
1368    pxor        mm1, mm5
1369    jmp .do_top
1370.fix_tr_2:
1371    punpckhbw   mm3, mm3
1372    pshufw      mm1, mm3, 0xFF
1373    jmp .do_topright
1374.do_top:
1375    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1376    movq        mm7, mm4
1377    test        r2d, r2d
1378    jz .fix_tr_2
1379    movq        mm0, [r0+8]
1380    movq        mm5, mm0
1381    movq        mm2, mm0
1382    movq        mm4, mm0
1383    psrlq       mm5, 56
1384    PALIGNR     mm2, mm3, 7, mm3
1385    PALIGNR     mm5, mm4, 1, mm4
1386    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1387.do_topright:
1388    lea          r1, [r0+r3*2]
1389    movq        mm6, mm1
1390    psrlq       mm1, 56
1391    movq        mm4, mm1
1392    lea          r2, [r1+r3*2]
1393    movq        mm2, mm6
1394    PALIGNR     mm2, mm7, 1, mm0
1395    movq        mm3, mm6
1396    PALIGNR     mm3, mm7, 7, mm0
1397    PALIGNR     mm4, mm6, 1, mm0
1398    movq        mm5, mm7
1399    movq        mm1, mm7
1400    movq        mm7, mm6
1401    lea          r4, [r2+r3*2]
1402    psllq       mm1, 8
1403    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1404    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1405    movq  [r4+r3*2], mm1
1406    movq        mm2, mm0
1407    psllq       mm1, 8
1408    psrlq       mm2, 56
1409    psllq       mm0, 8
1410    por         mm1, mm2
1411    movq  [r4+r3*1], mm1
1412    movq        mm2, mm0
1413    psllq       mm1, 8
1414    psrlq       mm2, 56
1415    psllq       mm0, 8
1416    por         mm1, mm2
1417    movq  [r2+r3*2], mm1
1418    movq        mm2, mm0
1419    psllq       mm1, 8
1420    psrlq       mm2, 56
1421    psllq       mm0, 8
1422    por         mm1, mm2
1423    movq  [r2+r3*1], mm1
1424    movq        mm2, mm0
1425    psllq       mm1, 8
1426    psrlq       mm2, 56
1427    psllq       mm0, 8
1428    por         mm1, mm2
1429    movq  [r1+r3*2], mm1
1430    movq        mm2, mm0
1431    psllq       mm1, 8
1432    psrlq       mm2, 56
1433    psllq       mm0, 8
1434    por         mm1, mm2
1435    movq  [r1+r3*1], mm1
1436    movq        mm2, mm0
1437    psllq       mm1, 8
1438    psrlq       mm2, 56
1439    psllq       mm0, 8
1440    por         mm1, mm2
1441    movq  [r0+r3*2], mm1
1442    psllq       mm1, 8
1443    psrlq       mm0, 56
1444    por         mm1, mm0
1445    movq  [r0+r3*1], mm1
1446    RET
1447
; SSE2/SSSE3 diagonal down-left: same edge construction as the mmxext
; version (in MMX registers), but the two filtered 8-pixel halves are
; merged into one XMM register so each output row is a single psrldq.
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
    sub          r0, r3             ; r0 -> top neighbour row
    movq        mm0, [r0-8]
    movq        mm3, [r0]           ; top row t[0..7]
    movq        mm1, [r0+8]         ; topright row
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0    ; [tl, t0..t6]
    PALIGNR     mm1, mm4, 1, mm4    ; [t1..t7, tr]
    test        r1d, r1d ; top_left
    jz .fix_lt_2
    test        r2d, r2d ; top_right
    jz .fix_tr_1
    jmp .do_top
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5            ; replace missing topleft with t0
    test        r2d, r2d ; top_right
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5            ; replace missing topright with t7
    jmp .do_top
.fix_tr_2:                          ; no topright row: replicate t7 across it
    punpckhbw   mm3, mm3
    pshufw      mm1, mm3, 0xFF
    jmp .do_topright
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
    movq2dq    xmm3, mm4            ; filtered top -> low half of xmm3
    test        r2d, r2d ; top_right
    jz .fix_tr_2
    movq        mm0, [r0+8]
    movq        mm5, mm0
    movq        mm2, mm0
    movq        mm4, mm0
    psrlq       mm5, 56
    PALIGNR     mm2, mm3, 7, mm3
    PALIGNR     mm5, mm4, 1, mm4
    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 ; filtered topright -> mm1
.do_topright:
    movq2dq    xmm4, mm1
    psrlq       mm1, 56
    movq2dq    xmm5, mm1            ; last topright pixel (edge extension)
    lea         r1, [r0+r3*2]
    pslldq    xmm4, 8
    por       xmm3, xmm4            ; xmm3 = full 16-pixel filtered edge
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1               ; edge shifted left
    pslldq    xmm5, 15
    por       xmm2, xmm5            ; ... with last pixel replicated at the end
    lea         r2, [r1+r3*2]
    movdqa    xmm1, xmm3
    pslldq    xmm1, 1               ; edge shifted right
INIT_XMM cpuname                    ; switch so the macro below uses XMM regs
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 ; final diagonal
    psrldq    xmm0, 1               ; each row = diagonal advanced one byte
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    psrldq    xmm0, 1
    lea         r0, [r2+r3*2]
    movq [r1+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r1+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r2+r3*2], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*1], xmm0
    psrldq    xmm0, 1
    movq [r0+r3*2], xmm0
    RET
%endmacro

; instantiated with INIT_MMX so the shared edge-building code stays MMX;
; INIT_XMM cpuname inside the macro switches to SSE for the final filter
INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT
1535
1536;-----------------------------------------------------------------------------
1537; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
1538;                                      int has_topright, ptrdiff_t stride)
1539;-----------------------------------------------------------------------------
1540
1541INIT_MMX mmxext
1542cglobal pred8x8l_down_right_8, 4,5
1543    sub          r0, r3
1544    lea          r4, [r0+r3*2]
1545    movq        mm0, [r0+r3*1-8]
1546    punpckhbw   mm0, [r0+r3*0-8]
1547    movq        mm1, [r4+r3*1-8]
1548    punpckhbw   mm1, [r0+r3*2-8]
1549    mov          r4, r0
1550    punpckhwd   mm1, mm0
1551    lea          r0, [r0+r3*4]
1552    movq        mm2, [r0+r3*1-8]
1553    punpckhbw   mm2, [r0+r3*0-8]
1554    lea          r0, [r0+r3*2]
1555    movq        mm3, [r0+r3*1-8]
1556    punpckhbw   mm3, [r0+r3*0-8]
1557    punpckhwd   mm3, mm2
1558    punpckhdq   mm3, mm1
1559    lea          r0, [r0+r3*2]
1560    movq        mm0, [r0+r3*0-8]
1561    movq        mm1, [r4]
1562    mov          r0, r4
1563    movq        mm4, mm3
1564    movq        mm2, mm3
1565    PALIGNR     mm4, mm0, 7, mm0
1566    PALIGNR     mm1, mm2, 1, mm2
1567    test        r1d, r1d ; top_left
1568    jz .fix_lt_1
1569.do_left:
1570    movq        mm0, mm4
1571    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1572    movq        mm4, mm0
1573    movq        mm7, mm2
1574    movq        mm6, mm2
1575    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1576    psllq       mm1, 56
1577    PALIGNR     mm7, mm1, 7, mm3
1578    movq        mm0, [r0-8]
1579    movq        mm3, [r0]
1580    movq        mm1, [r0+8]
1581    movq        mm2, mm3
1582    movq        mm4, mm3
1583    PALIGNR     mm2, mm0, 7, mm0
1584    PALIGNR     mm1, mm4, 1, mm4
1585    test        r1d, r1d ; top_left
1586    jz .fix_lt_2
1587    test        r2d, r2d ; top_right
1588    jz .fix_tr_1
1589.do_top:
1590    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1591    movq        mm5, mm4
1592    jmp .body
1593.fix_lt_1:
1594    movq        mm5, mm3
1595    pxor        mm5, mm4
1596    psrlq       mm5, 56
1597    psllq       mm5, 48
1598    pxor        mm1, mm5
1599    jmp .do_left
1600.fix_lt_2:
1601    movq        mm5, mm3
1602    pxor        mm5, mm2
1603    psllq       mm5, 56
1604    psrlq       mm5, 56
1605    pxor        mm2, mm5
1606    test        r2d, r2d ; top_right
1607    jnz .do_top
1608.fix_tr_1:
1609    movq        mm5, mm3
1610    pxor        mm5, mm1
1611    psrlq       mm5, 56
1612    psllq       mm5, 56
1613    pxor        mm1, mm5
1614    jmp .do_top
1615.body:
1616    lea         r1, [r0+r3*2]
1617    movq       mm1, mm7
1618    movq       mm7, mm5
1619    movq       mm5, mm6
1620    movq       mm2, mm7
1621    lea         r2, [r1+r3*2]
1622    PALIGNR    mm2, mm6, 1, mm0
1623    movq       mm3, mm7
1624    PALIGNR    mm3, mm6, 7, mm0
1625    movq       mm4, mm7
1626    lea         r4, [r2+r3*2]
1627    psrlq      mm4, 8
1628    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1629    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1630    movq [r4+r3*2], mm0
1631    movq       mm2, mm1
1632    psrlq      mm0, 8
1633    psllq      mm2, 56
1634    psrlq      mm1, 8
1635    por        mm0, mm2
1636    movq [r4+r3*1], mm0
1637    movq       mm2, mm1
1638    psrlq      mm0, 8
1639    psllq      mm2, 56
1640    psrlq      mm1, 8
1641    por        mm0, mm2
1642    movq [r2+r3*2], mm0
1643    movq       mm2, mm1
1644    psrlq      mm0, 8
1645    psllq      mm2, 56
1646    psrlq      mm1, 8
1647    por        mm0, mm2
1648    movq [r2+r3*1], mm0
1649    movq       mm2, mm1
1650    psrlq      mm0, 8
1651    psllq      mm2, 56
1652    psrlq      mm1, 8
1653    por        mm0, mm2
1654    movq [r1+r3*2], mm0
1655    movq       mm2, mm1
1656    psrlq      mm0, 8
1657    psllq      mm2, 56
1658    psrlq      mm1, 8
1659    por        mm0, mm2
1660    movq [r1+r3*1], mm0
1661    movq       mm2, mm1
1662    psrlq      mm0, 8
1663    psllq      mm2, 56
1664    psrlq      mm1, 8
1665    por        mm0, mm2
1666    movq [r0+r3*2], mm0
1667    psrlq      mm0, 8
1668    psllq      mm1, 56
1669    por        mm0, mm1
1670    movq [r0+r3*1], mm0
1671    RET
1672
; SSE2/SSSE3 diagonal down-right: edge construction in MMX (as above), the
; filtered left/top halves are merged into xmm registers and the final
; lowpass + per-row psrldq emission happens in XMM.
%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
    sub          r0, r3             ; r0 -> top neighbour row
    lea          r4, [r0+r3*2]
    ; gather the left column into mm3 (punpckh cascade)
    movq        mm0, [r0+r3*1-8]
    punpckhbw   mm0, [r0+r3*0-8]
    movq        mm1, [r4+r3*1-8]
    punpckhbw   mm1, [r0+r3*2-8]
    mov          r4, r0
    punpckhwd   mm1, mm0
    lea          r0, [r0+r3*4]
    movq        mm2, [r0+r3*1-8]
    punpckhbw   mm2, [r0+r3*0-8]
    lea          r0, [r0+r3*2]
    movq        mm3, [r0+r3*1-8]
    punpckhbw   mm3, [r0+r3*0-8]
    punpckhwd   mm3, mm2
    punpckhdq   mm3, mm1            ; mm3 = left column
    lea          r0, [r0+r3*2]
    movq        mm0, [r0+r3*0-8]
    movq        mm1, [r4]           ; top row
    mov          r0, r4             ; restore r0 -> top row
    movq        mm4, mm3
    movq        mm2, mm3
    PALIGNR     mm4, mm0, 7, mm0
    PALIGNR     mm1, mm2, 1, mm2
    test        r1d, r1d
    jz .fix_lt_1
    jmp .do_left
.fix_lt_1:                          ; no topleft: patch byte shifted into mm1
    movq        mm5, mm3
    pxor        mm5, mm4
    psrlq       mm5, 56
    psllq       mm5, 48
    pxor        mm1, mm5
    jmp .do_left
.fix_lt_2:
    movq        mm5, mm3
    pxor        mm5, mm2
    psllq       mm5, 56
    psrlq       mm5, 56
    pxor        mm2, mm5            ; replace missing topleft with t0
    test        r2d, r2d
    jnz .do_top
.fix_tr_1:
    movq        mm5, mm3
    pxor        mm5, mm1
    psrlq       mm5, 56
    psllq       mm5, 56
    pxor        mm1, mm5            ; replace missing topright with t7
    jmp .do_top
.do_left:
    ; lowpass the left column; stash results in xmm3 / xmm1
    movq        mm0, mm4
    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
    movq        mm4, mm0
    movq        mm7, mm2
    movq2dq    xmm3, mm2
    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
    psllq       mm1, 56
    PALIGNR     mm7, mm1, 7, mm3
    movq2dq    xmm1, mm7
    ; load the top row and its shifted neighbours
    movq        mm0, [r0-8]
    movq        mm3, [r0]
    movq        mm1, [r0+8]
    movq        mm2, mm3
    movq        mm4, mm3
    PALIGNR     mm2, mm0, 7, mm0
    PALIGNR     mm1, mm4, 1, mm4
    test        r1d, r1d
    jz .fix_lt_2
    test        r2d, r2d
    jz .fix_tr_1
.do_top:
    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 ; filtered top row
    movq2dq   xmm4, mm4
    lea         r1, [r0+r3*2]
    ; join the filtered left and top halves into one 16-byte edge (xmm3)
    ; and build its one-byte-shifted neighbours (xmm1, xmm2)
    movdqa    xmm0, xmm3
    pslldq    xmm4, 8
    por       xmm3, xmm4
    lea         r2, [r1+r3*2]
    pslldq    xmm4, 1
    por       xmm1, xmm4
    psrldq    xmm0, 7
    pslldq    xmm0, 15
    psrldq    xmm0, 7
    por       xmm1, xmm0
    lea         r0, [r2+r3*2]
    movdqa    xmm2, xmm3
    psrldq    xmm2, 1
INIT_XMM cpuname                    ; switch so the macro below uses XMM regs
    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 ; final diagonal
    ; two staggered copies let each store pair use one psrldq step
    movdqa    xmm1, xmm0
    psrldq    xmm1, 1
    movq [r0+r3*2], xmm0            ; bottom rows first
    movq [r0+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r2+r3*2], xmm0
    movq [r2+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r1+r3*2], xmm0
    movq [r1+r3*1], xmm1
    psrldq    xmm0, 2
    psrldq    xmm1, 2
    movq [r4+r3*2], xmm0
    movq [r4+r3*1], xmm1            ; top rows last
    RET
%endmacro

; instantiated with INIT_MMX so the shared edge-building code stays MMX;
; INIT_XMM cpuname inside the macro switches to SSE for the final filter
INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT
1787
1788;-----------------------------------------------------------------------------
1789; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1790;                                   int has_topright, ptrdiff_t stride)
1791;-----------------------------------------------------------------------------
1792
1793INIT_MMX mmxext
1794cglobal pred8x8l_vertical_right_8, 4,5
1795    sub          r0, r3
1796    lea          r4, [r0+r3*2]
1797    movq        mm0, [r0+r3*1-8]
1798    punpckhbw   mm0, [r0+r3*0-8]
1799    movq        mm1, [r4+r3*1-8]
1800    punpckhbw   mm1, [r0+r3*2-8]
1801    mov          r4, r0
1802    punpckhwd   mm1, mm0
1803    lea          r0, [r0+r3*4]
1804    movq        mm2, [r0+r3*1-8]
1805    punpckhbw   mm2, [r0+r3*0-8]
1806    lea          r0, [r0+r3*2]
1807    movq        mm3, [r0+r3*1-8]
1808    punpckhbw   mm3, [r0+r3*0-8]
1809    punpckhwd   mm3, mm2
1810    punpckhdq   mm3, mm1
1811    lea          r0, [r0+r3*2]
1812    movq        mm0, [r0+r3*0-8]
1813    movq        mm1, [r4]
1814    mov          r0, r4
1815    movq        mm4, mm3
1816    movq        mm2, mm3
1817    PALIGNR     mm4, mm0, 7, mm0
1818    PALIGNR     mm1, mm2, 1, mm2
1819    test        r1d, r1d
1820    jz .fix_lt_1
1821    jmp .do_left
1822.fix_lt_1:
1823    movq        mm5, mm3
1824    pxor        mm5, mm4
1825    psrlq       mm5, 56
1826    psllq       mm5, 48
1827    pxor        mm1, mm5
1828    jmp .do_left
1829.fix_lt_2:
1830    movq        mm5, mm3
1831    pxor        mm5, mm2
1832    psllq       mm5, 56
1833    psrlq       mm5, 56
1834    pxor        mm2, mm5
1835    test        r2d, r2d
1836    jnz .do_top
1837.fix_tr_1:
1838    movq        mm5, mm3
1839    pxor        mm5, mm1
1840    psrlq       mm5, 56
1841    psllq       mm5, 56
1842    pxor        mm1, mm5
1843    jmp .do_top
1844.do_left:
1845    movq        mm0, mm4
1846    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1847    movq        mm7, mm2
1848    movq        mm0, [r0-8]
1849    movq        mm3, [r0]
1850    movq        mm1, [r0+8]
1851    movq        mm2, mm3
1852    movq        mm4, mm3
1853    PALIGNR     mm2, mm0, 7, mm0
1854    PALIGNR     mm1, mm4, 1, mm4
1855    test        r1d, r1d
1856    jz .fix_lt_2
1857    test        r2d, r2d
1858    jz .fix_tr_1
1859.do_top:
1860    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1861    lea         r1, [r0+r3*2]
1862    movq       mm2, mm6
1863    movq       mm3, mm6
1864    PALIGNR    mm3, mm7, 7, mm0
1865    PALIGNR    mm6, mm7, 6, mm1
1866    movq       mm4, mm3
1867    pavgb      mm3, mm2
1868    lea         r2, [r1+r3*2]
1869    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1870    movq [r0+r3*1], mm3
1871    movq [r0+r3*2], mm0
1872    movq       mm5, mm0
1873    movq       mm6, mm3
1874    movq       mm1, mm7
1875    movq       mm2, mm1
1876    psllq      mm2, 8
1877    movq       mm3, mm1
1878    psllq      mm3, 16
1879    lea         r4, [r2+r3*2]
1880    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1881    PALIGNR    mm6, mm0, 7, mm2
1882    movq [r1+r3*1], mm6
1883    psllq      mm0, 8
1884    PALIGNR    mm5, mm0, 7, mm1
1885    movq [r1+r3*2], mm5
1886    psllq      mm0, 8
1887    PALIGNR    mm6, mm0, 7, mm2
1888    movq [r2+r3*1], mm6
1889    psllq      mm0, 8
1890    PALIGNR    mm5, mm0, 7, mm1
1891    movq [r2+r3*2], mm5
1892    psllq      mm0, 8
1893    PALIGNR    mm6, mm0, 7, mm2
1894    movq [r4+r3*1], mm6
1895    psllq      mm0, 8
1896    PALIGNR    mm5, mm0, 7, mm1
1897    movq [r4+r3*2], mm5
1898    RET
1899
1900%macro PRED8x8L_VERTICAL_RIGHT 0
1901cglobal pred8x8l_vertical_right_8, 4,5,7
1902    ; manually spill XMM registers for Win64 because
1903    ; the code here is initialized with INIT_MMX
1904    WIN64_SPILL_XMM 7
1905    sub          r0, r3
1906    lea          r4, [r0+r3*2]
1907    movq        mm0, [r0+r3*1-8]
1908    punpckhbw   mm0, [r0+r3*0-8]
1909    movq        mm1, [r4+r3*1-8]
1910    punpckhbw   mm1, [r0+r3*2-8]
1911    mov          r4, r0
1912    punpckhwd   mm1, mm0
1913    lea          r0, [r0+r3*4]
1914    movq        mm2, [r0+r3*1-8]
1915    punpckhbw   mm2, [r0+r3*0-8]
1916    lea          r0, [r0+r3*2]
1917    movq        mm3, [r0+r3*1-8]
1918    punpckhbw   mm3, [r0+r3*0-8]
1919    punpckhwd   mm3, mm2
1920    punpckhdq   mm3, mm1
1921    lea          r0, [r0+r3*2]
1922    movq        mm0, [r0+r3*0-8]
1923    movq        mm1, [r4]
1924    mov          r0, r4
1925    movq        mm4, mm3
1926    movq        mm2, mm3
1927    PALIGNR     mm4, mm0, 7, mm0
1928    PALIGNR     mm1, mm2, 1, mm2
1929    test        r1d, r1d
1930    jnz .do_left
1931.fix_lt_1:
1932    movq        mm5, mm3
1933    pxor        mm5, mm4
1934    psrlq       mm5, 56
1935    psllq       mm5, 48
1936    pxor        mm1, mm5
1937    jmp .do_left
1938.fix_lt_2:
1939    movq        mm5, mm3
1940    pxor        mm5, mm2
1941    psllq       mm5, 56
1942    psrlq       mm5, 56
1943    pxor        mm2, mm5
1944    test        r2d, r2d
1945    jnz .do_top
1946.fix_tr_1:
1947    movq        mm5, mm3
1948    pxor        mm5, mm1
1949    psrlq       mm5, 56
1950    psllq       mm5, 56
1951    pxor        mm1, mm5
1952    jmp .do_top
1953.do_left:
1954    movq        mm0, mm4
1955    PRE

Large files files are truncated, but you can click here to view the full file