; Scraped-page metadata (not part of the original source file):
;   PageRenderTime 60ms CodeModel.GetById 11ms app.highlight 43ms RepoModel.GetById 1ms app.codeStats 0ms
;   Path: /media/libvpx/vp8/common/x86/subpixel_mmx.asm
;   Repository: http://github.com/zpao/v8monkey
;   Assembly | 727 lines | 466 code | 181 blank | 80 comment | 0 complexity | c5007295dc4543de92fdd3602b3e30ec MD5 | raw file
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


 12%include "vpx_ports/x86_abi_support.asm"
 13
 14
 15%define BLOCK_HEIGHT_WIDTH 4
 16%define vp8_filter_weight 128
 17%define VP8_FILTER_SHIFT  7
 18
 19
 20;void vp8_filter_block1d_h6_mmx
 21;(
 22;    unsigned char   *src_ptr,
 23;    unsigned short  *output_ptr,
 24;    unsigned int    src_pixels_per_line,
 25;    unsigned int    pixel_step,
 26;    unsigned int    output_height,
 27;    unsigned int    output_width,
 28;    short           * vp8_filter
 29;)
 30global sym(vp8_filter_block1d_h6_mmx)
 31sym(vp8_filter_block1d_h6_mmx):
 32    push        rbp
 33    mov         rbp, rsp
 34    SHADOW_ARGS_TO_STACK 7
 35    GET_GOT     rbx
 36    push        rsi
 37    push        rdi
 38    ; end prolog
 39
 40        mov         rdx,    arg(6) ;vp8_filter
 41
 42        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!
 43        movq        mm2,    [rdx + 32]         ;
 44        movq        mm6,    [rdx + 48]        ;
 45        movq        mm7,    [rdx + 64]        ;
 46
 47        mov         rdi,    arg(1) ;output_ptr
 48        mov         rsi,    arg(0) ;src_ptr
 49        movsxd      rcx,    dword ptr arg(4) ;output_height
 50        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
 51        pxor        mm0,    mm0              ; mm0 = 00000000
 52
 53nextrow:
 54        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
 55        movq        mm4,    mm3              ; mm4 = p-2..p5
 56        psrlq       mm3,    8                ; mm3 = p-1..p5
 57        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2
 58        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.
 59
 60        movq        mm5,    mm4              ; mm5 = p-2..p5
 61        punpckhbw   mm4,    mm0              ; mm5 = p2..p5
 62        pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers
 63        paddsw      mm3,    mm4              ; mm3 += mm5
 64
 65        movq        mm4,    mm5              ; mm4 = p-2..p5;
 66        psrlq       mm5,    16               ; mm5 = p0..p5;
 67        punpcklbw   mm5,    mm0              ; mm5 = p0..p3
 68        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers
 69        paddsw      mm3,    mm5              ; mm3 += mm5
 70
 71        movq        mm5,    mm4              ; mm5 = p-2..p5
 72        psrlq       mm4,    24               ; mm4 = p1..p5
 73        punpcklbw   mm4,    mm0              ; mm4 = p1..p4
 74        pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers
 75        paddsw      mm3,    mm4              ; mm3 += mm5
 76
 77        ; do outer positive taps
 78        movd        mm4,    [rsi+3]
 79        punpcklbw   mm4,    mm0              ; mm5 = p3..p6
 80        pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers
 81        paddsw      mm3,    mm4              ; mm3 += mm5
 82
 83        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1
 84        pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers
 85        paddsw      mm3,    mm5              ; mm3 += mm5
 86
 87        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value
 88        psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 /= 128
 89        packuswb    mm3,    mm0              ; pack and unpack to saturate
 90        punpcklbw   mm3,    mm0              ;
 91
 92        movq        [rdi],  mm3              ; store the results in the destination
 93
 94%if ABI_IS_32BIT
 95        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
 96        add         rdi,    rax;
 97%else
 98        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
 99        add         rdi,    rax;
100
101        add         rsi,    r8               ; next line
102%endif
103
104        dec         rcx                      ; decrement count
105        jnz         nextrow                  ; next row
106
107    ; begin epilog
108    pop rdi
109    pop rsi
110    RESTORE_GOT
111    UNSHADOW_ARGS
112    pop         rbp
113    ret
114
115
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit samples, four output bytes per row.
;
; src_ptr holds 16-bit samples; consecutive rows are pixels_per_line BYTES
; apart (the value is added directly to the byte pointer).  For each of
; output_height rows, the four words at rows -2..+3 relative to the current
; row are multiplied by taps 0..5, summed with saturating adds, rounded with
; rd, shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and written as
; four bytes at output_ptr, which then advances by output_pitch.
;
; vp8_filter is read as six taps spaced 16 bytes apart; only the first four
; words of each tap are used.  pixel_step and output_width are not read.
; output_height must be non-zero (the loop is do/while shaped).
;
; Register roles inside the loop:
;   rsi = source row -2, rdx = source row stride, rdi = destination row,
;   rax = output_pitch, rcx = rows remaining, rbx = filter taps,
;   mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = rounding constant.
global sym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; rbx was handed to GET_GOT above and is about to be reused as the
        ; filter pointer, so fetch rd through GLOBAL() first.
        movq        mm5, [GLOBAL(rd)]     ; mm5 = rounding constant
        push        rbx                   ; preserve rbx while it holds the filter pointer
        mov         rbx, arg(7) ;vp8_filter
        movq        mm1, [rbx + 16]       ; mm1 = tap 1
        movq        mm2, [rbx + 32]       ; mm2 = tap 2
        movq        mm6, [rbx + 48]       ; mm6 = tap 3
        movq        mm7, [rbx + 64]       ; mm7 = tap 4

        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
        mov         rdi, arg(1) ;output_ptr
        mov         rsi, arg(0) ;src_ptr
        sub         rsi, rdx
        sub         rsi, rdx              ; rsi = row -2
        movsxd      rcx, DWORD PTR arg(5) ;output_height
        movsxd      rax, DWORD PTR arg(2) ;output_pitch
        pxor        mm0, mm0              ; mm0 = 00000000


.nextrow_cv:
        movq        mm3, [rsi+rdx]        ; mm3 = row -1
        pmullw      mm3, mm1              ; mm3 *= tap 1


        movq        mm4, [rsi + 4*rdx]    ; mm4 = row 2
        pmullw      mm4, mm7              ; mm4 *= tap 4
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]    ; mm4 = row 0
        pmullw      mm4, mm2              ; mm4 *= tap 2
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi]            ; mm4 = row -2
        pmullw      mm4, [rbx]            ; mm4 *= tap 0
        paddsw      mm3, mm4              ; mm3 += mm4


        ; Advancing rsi here avoids needing 3*pitch and 5*pitch addressing,
        ; and doubles as the per-iteration source advance.
        add         rsi, rdx
        movq        mm4, [rsi + 2*rdx]    ; mm4 = row 1
        pmullw      mm4, mm6              ; mm4 *= tap 3
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]    ; mm4 = row 3
        pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, mm5              ; mm3 += rounding constant
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3, mm0              ; pack to bytes, clamping to 0..255

        movd        [rdi],mm3             ; store four result bytes
        ; Each iteration re-reads five of the six rows loaded by the previous
        ; one.  The original authors judged this cheap because the data should
        ; already be in cache, while noting it is avoidable.
        lea         rdi,  [rdi+rax]       ; next destination row
        dec         rcx                   ; one row done
        jnz         .nextrow_cv

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing an 8-byte-wide, 8-row block.
;
; xoffset and yoffset each select a 32-byte entry of vp8_bilinear_filters_mmx
; (two taps, 16 bytes apart).  The horizontal pass blends src[x] and src[x+1]
; with the xoffset taps, rounds with rd and shifts by VP8_FILTER_SHIFT.  The
; vertical pass blends the horizontally filtered previous and current rows
; with the yoffset taps in the same way.  Nine source rows are read (one row
; primes the vertical pass); each read covers bytes 0..8 of the row.
;
; The loop stops when the destination pointer equals dst_ptr + 8*dst_pitch,
; so dst_pitch must be non-zero.
;
; Register roles inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination pointer, rax = vertical filter entry,
;   mm0 = 0, mm1/mm2 = horizontal taps 0/1,
;   mm7 = previous horizontally filtered row (packed bytes).
global sym(vp8_bilinear_predict8x8_mmx)
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        shl         rax,        5                   ; xoffset * 32 bytes per table entry
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]

        add         rax,        rcx                 ; rax = HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = horizontal tap 0

        movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0

        shl         rax,        5                   ; yoffset * 32 bytes per table entry
        add         rax,        rcx                 ; rax = VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 rows (loop end marker)
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant; keep it in r8 on 64-bit targets.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; horizontal pass for the first source row (primes mm7)
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; copy for the high half

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; *= horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = filtered row as 8 bytes

        add         rsi,        rdx                 ; next source row
.next_row_8x8:
        ; horizontal pass for the current source row
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; copy for the high half

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ; *= horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous filtered row * vertical tap 0
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ; keep the current filtered row ...
        packuswb    mm7,        mm4                 ; ... as bytes for the next iteration


        ; current filtered row * vertical tap 1, plus the previous-row term
        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store 8 result bytes

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif
        cmp         rdi,        rcx                 ; reached dst_ptr + 8*dst_pitch?
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing an 8-byte-wide, 4-row block.
;
; xoffset and yoffset each select a 32-byte entry of vp8_bilinear_filters_mmx
; (two taps, 16 bytes apart).  The horizontal pass blends src[x] and src[x+1]
; with the xoffset taps, rounds with rd and shifts by VP8_FILTER_SHIFT.  The
; vertical pass blends the horizontally filtered previous and current rows
; with the yoffset taps in the same way.  Five source rows are read (one row
; primes the vertical pass); each read covers bytes 0..8 of the row.
;
; The loop stops when the destination pointer equals dst_ptr + 4*dst_pitch,
; so dst_pitch must be non-zero.
;
; Register roles inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination pointer, rax = vertical filter entry,
;   mm0 = 0, mm1/mm2 = horizontal taps 0/1,
;   mm7 = previous horizontally filtered row (packed bytes).
global sym(vp8_bilinear_predict8x4_mmx)
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
        shl         rax,        5                   ; xoffset * 32 bytes per table entry

        mov         rsi,        arg(0) ;src_ptr
        add         rax,        rcx                 ; rax = HFilter

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = horizontal tap 0

        movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5                   ; yoffset * 32 bytes per table entry

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4 rows (loop end marker)

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant; keep it in r8 on 64-bit targets.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; horizontal pass for the first source row (primes mm7)
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; copy for the high half

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; *= horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = filtered row as 8 bytes

        add         rsi,        rdx                 ; next source row
.next_row_8x4:
        ; horizontal pass for the current source row
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; copy for the high half

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ; *= horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous filtered row * vertical tap 0
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ; keep the current filtered row ...
        packuswb    mm7,        mm4                 ; ... as bytes for the next iteration


        ; current filtered row * vertical tap 1, plus the previous-row term
        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store 8 result bytes

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif
        cmp         rdi,        rcx                 ; reached dst_ptr + 4*dst_pitch?
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing a 4-byte-wide, 4-row block.
;
; xoffset and yoffset each select a 32-byte entry of vp8_bilinear_filters_mmx
; (two taps, 16 bytes apart).  The horizontal pass blends src[x] and src[x+1]
; with the xoffset taps, rounds with rd and shifts by VP8_FILTER_SHIFT.  The
; vertical pass blends the horizontally filtered previous and current rows
; with the yoffset taps in the same way.  Five source rows are read (one row
; primes the vertical pass); each read covers bytes 0..4 of the row.
;
; The loop stops when the destination pointer equals dst_ptr + 4*dst_pitch,
; so dst_pitch must be non-zero.
;
; Register roles inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination pointer, rax = vertical filter entry,
;   mm0 = 0, mm1/mm2 = horizontal taps 0/1,
;   mm7 = previous horizontally filtered row (packed bytes).
global sym(vp8_bilinear_predict4x4_mmx)
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
        shl         rax,        5                   ; xoffset * 32 bytes per table entry

        add         rax,        rcx                 ; rax = HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = horizontal tap 0

        movq        mm2,        [rax+16]            ; mm2 = horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5                   ; yoffset * 32 bytes per table entry

        add         rax,        rcx                 ; rax = VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = dst_ptr + 4 rows (loop end marker)

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch is loop-invariant; keep it in r8 on 64-bit targets.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; horizontal pass for the first source row (primes mm7)
        movd        mm3,        [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; *= horizontal tap 0
        movd        mm5,        [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,        mm0                 ; widen to words
        pmullw      mm5,        mm2                 ; *= horizontal tap 1

        paddw       mm3,        mm5                 ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm0                 ; mm7 = filtered row as 4 bytes

        add         rsi,        rdx                 ; next source row
.next_row_4x4:
        ; horizontal pass for the current source row
        movd        mm3,        [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; *= horizontal tap 0
        movd        mm5,        [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,        mm0                 ; widen to words
        pmullw      mm5,        mm2                 ; *= horizontal tap 1

        paddw       mm3,        mm5                 ;

        ; previous filtered row * vertical tap 0
        movq        mm5,        mm7                 ;
        punpcklbw   mm5,        mm0                 ;

        pmullw      mm5,        [rax]               ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7
        movq        mm7,        mm3                 ; keep the current filtered row ...

        packuswb    mm7,        mm0                 ; ... as bytes for the next iteration

        ; current filtered row * vertical tap 1, plus the previous-row term
        pmullw      mm3,        [rax+16]            ;
        paddw       mm3,        mm5                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += rounding constant
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        packuswb    mm3,        mm0
        movd        [rdi],      mm3                 ; store 4 result bytes

        add         rsi,        rdx                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ; dst_pitch
%endif

        cmp         rdi,        rcx                 ; reached dst_ptr + 4*dst_pitch?
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
align 16
; Rounding constant: four words of 0x40 (64), added before results are
; shifted right by 7 bits.
rd:
    times 4 dw 0x40

align 16
; Six-tap filter table: 8 filters, each with 6 taps.  Every tap is one value
; replicated across 8 words (16 bytes), so a filter occupies 96 bytes and
; tap k of a filter sits at byte offset 16*k.  The taps of each filter sum
; to 128.
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0


align 16
; Bilinear filter table: 8 entries of 2 taps.  Every tap is one value
; replicated across 8 words (16 bytes), so an entry occupies 32 bytes: tap 0
; at offset 0 and tap 1 at offset 16.  The two taps of each entry sum to 128.
global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
sym(vp8_bilinear_filters_mmx):
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112