PageRenderTime 56ms CodeModel.GetById 10ms app.highlight 41ms RepoModel.GetById 1ms app.codeStats 1ms

/media/libvpx/vp8/encoder/x86/sad_sse2.asm

http://github.com/zpao/v8monkey
Assembly | 410 lines | 255 code | 102 blank | 53 comment | 0 complexity | 1ec3e7a8f4593fc866a2f5bf69260e91 MD5 | raw file
  1;
  2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3;
  4;  Use of this source code is governed by a BSD-style license
  5;  that can be found in the LICENSE file in the root of the source
  6;  tree. An additional intellectual property rights grant can be found
  7;  in the file PATENTS.  All contributing project authors may
  8;  be found in the AUTHORS file in the root of the source tree.
  9;
 10
 11
 12%include "vpx_ports/x86_abi_support.asm"
 13
 ;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute byte differences between a src block and a ref block,
; each 16 bytes wide and 16 rows tall. The total is returned in rax.
;
; Register roles inside the body:
;   rsi, rdi   current src row / current ref row
;   rax, rdx   src stride / ref stride (sign-extended from the int args)
;   rcx        src_ptr + 16*src_stride, the value of rsi that ends the loop
;   xmm6       running total; each 64-bit half holds its own partial sum
global sym(vp8_sad16x16_wmt)
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        pxor            xmm6,       xmm6             ; total = 0

        ; rcx = src_ptr + 16*src_stride. Done as two *8 steps because an
        ; address index can be scaled by at most 8.
        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]

; Each pass handles two rows.
.row_pair_loop:

        ; ---- first row of the pair -------------------------------------
        ; The two 8-byte halves of a row are interleaved into one 16-byte
        ; register. src and ref are interleaved the same way, so bytes
        ; still line up pairwise for psadbw.
        movq            xmm0,       QWORD PTR [rsi]
        movq            xmm2,       QWORD PTR [rsi+8]
        punpcklbw       xmm0,       xmm2

        movq            xmm1,       QWORD PTR [rdi]
        movq            xmm3,       QWORD PTR [rdi+8]
        punpcklbw       xmm1,       xmm3

        psadbw          xmm0,       xmm1             ; two 8-byte SADs

        ; ---- second row of the pair ------------------------------------
        movq            xmm4,       QWORD PTR [rsi+rax]
        movq            xmm2,       QWORD PTR [rsi+rax+8]
        punpcklbw       xmm4,       xmm2

        movq            xmm5,       QWORD PTR [rdi+rdx]
        movq            xmm3,       QWORD PTR [rdi+rdx+8]
        punpcklbw       xmm5,       xmm3

        psadbw          xmm4,       xmm5

        ; ---- step both pointers down two rows, accumulate --------------
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm6,       xmm0
        paddw           xmm6,       xmm4

        cmp             rsi,        rcx
        jne             .row_pair_loop

        ; Fold the upper-half partial sum into the lower half and hand the
        ; low quadword back in rax.
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movq            rax,        xmm0

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 86
 ;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute byte differences between a src block and a ref block,
; each 8 bytes wide and 16 rows tall, returned in rax.
;
; Before every pair of rows the running total is compared (signed) with
; max_err; once it is greater, the routine returns that partial total
; without reading any further rows.
;
; Register roles inside the body:
;   rsi, rdi   current src row / current ref row
;   rbx, rdx   src stride / ref stride (sign-extended from the int args)
;   rcx        src_ptr + 16*src_stride, the value of rsi that ends the loop
;   mm7        running total
;   rax        copy of the running total used for the max_err test
global sym(vp8_sad8x16_wmt)
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        pxor            mm7,        mm7              ; total = 0

        ; rcx = src_ptr + 16*src_stride, built as two *8 steps.
        lea             rcx,        [rsi+rbx*8]
        lea             rcx,        [rcx+rbx*8]

.row_pair_loop:

        ; Bail out with the partial total once it is above max_err.
        movq            rax,        mm7
        cmp             eax,        arg(4)
        jg              .early_exit

        ; first row of the pair
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1

        ; second row of the pair
        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]
        psadbw          mm2,        mm3

        ; step both pointers down two rows, accumulate
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm7,        mm0
        paddw           mm7,        mm2

        cmp             rsi,        rcx
        jne             .row_pair_loop

        movq            rax,        mm7              ; full total

.early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
149
150
;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute byte differences between a src block and a ref block,
; each 8 bytes wide and 8 rows tall, returned in rax.
;
; The fifth argument is read through arg(4): before every row the running
; total is compared (signed) with it, and once the total is greater the
; routine returns that partial total without reading any further rows.
;
; Register roles inside the body:
;   rsi, rdi   current src row / current ref row
;   rbx, rdx   src stride / ref stride (sign-extended from the int args)
;   rcx        src_ptr + 8*src_stride, the value of rsi that ends the loop
;   mm7        running total
;   rax        copy of the running total used for the max_err test
global sym(vp8_sad8x8_wmt)
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        pxor            mm7,        mm7              ; total = 0
        lea             rcx,        [rsi+rbx*8]      ; loop sentinel

.row_loop:

        ; Bail out with the partial total once it is above max_err.
        movq            rax,        mm7
        cmp             eax,        arg(4)
        jg              .early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1

        ; next row; the flags written here are replaced by the cmp below
        add             rsi,        rbx
        add             rdi,        rdx

        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             .row_loop

        movq            rax,        mm7              ; full total
.early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
203
;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute byte differences between a src block and a ref block,
; each 4 bytes wide and 4 rows tall, returned in rax.
;
; Straight-line code: two 4-byte rows are interleaved into one 8-byte MMX
; register so that a single psadbw covers a pair of rows. src and ref are
; interleaved the same way, so bytes still line up pairwise.
;
; Register roles inside the body:
;   rsi, rdi   current src row / current ref row
;   rax, rdx   src stride / ref stride (rax is reused for the result)
;   mm0-mm7    scratch; all eight MMX registers are overwritten
global sym(vp8_sad4x4_wmt)
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; ---- rows 0 and 1 ----------------------------------------------
        movd            mm0,        DWORD PTR [rsi]
        movd            mm2,        DWORD PTR [rsi+rax]
        punpcklbw       mm0,        mm2

        movd            mm1,        DWORD PTR [rdi]
        movd            mm3,        DWORD PTR [rdi+rdx]
        punpcklbw       mm1,        mm3

        psadbw          mm0,        mm1

        ; ---- step both pointers down two rows --------------------------
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        ; ---- rows 2 and 3 ----------------------------------------------
        movd            mm4,        DWORD PTR [rsi]
        movd            mm6,        DWORD PTR [rsi+rax]
        punpcklbw       mm4,        mm6

        movd            mm5,        DWORD PTR [rdi]
        movd            mm7,        DWORD PTR [rdi+rdx]
        punpcklbw       mm5,        mm7

        psadbw          mm4,        mm5

        ; ---- combine the two partial sums and return -------------------
        paddw           mm0,        mm4
        movq            rax,        mm0

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
257
258
;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Sum of absolute byte differences between a src block and a ref block,
; each 16 bytes wide and 8 rows tall, returned in rax.
;
; The fifth argument is read through arg(4): before every pair of rows the
; running total is compared (signed) with it, and once the total is
; greater the routine returns that partial total without reading any
; further rows.
;
; Register roles inside the body:
;   rsi, rdi   current src row / current ref row
;   rbx, rdx   src stride / ref stride (sign-extended from the int args)
;   rcx        src_ptr + 8*src_stride, the value of rsi that ends the loop
;   mm7        running total
;   rax        copy of the running total used for the max_err test
global sym(vp8_sad16x8_wmt)
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        pxor            mm7,        mm7              ; total = 0
        lea             rcx,        [rsi+rbx*8]      ; loop sentinel

; Each pass handles two rows, each row as two 8-byte halves.
.row_pair_loop:

        ; Bail out with the partial total once it is above max_err.
        movq            rax,        mm7
        cmp             eax,        arg(4)
        jg              .early_exit

        ; ---- first row: left half, right half --------------------------
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1

        movq            mm2,        QWORD PTR [rsi+8]
        movq            mm3,        QWORD PTR [rdi+8]
        psadbw          mm2,        mm3

        paddw           mm0,        mm2              ; SAD of first row

        ; ---- second row: left half, right half -------------------------
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]
        psadbw          mm4,        mm5

        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]
        psadbw          mm1,        mm3

        paddw           mm4,        mm1              ; SAD of second row

        ; ---- step both pointers down two rows, accumulate --------------
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             .row_pair_loop

        movq            rax,        mm7              ; full total

.early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
331
;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes each from src to dst.
;
; Source rows are read with movdqu, so src may have any alignment.
; Destination rows are written with movdqa, so dst_ptr and dst_stride must
; keep every destination row 16-byte aligned.
;
; Rows are moved four at a time while at least four remain, then one at a
; time. A height below 4 goes straight to the one-row loop, and a height
; that is zero or negative copies nothing.
;
; Register roles inside the body:
;   rsi, rdi   current src row / current dst row
;   rax, rdx   src stride / dst stride (sign-extended from the int args)
;   rcx        rows still to copy
;   xmm0-xmm7  staging for up to four 32-byte rows
global sym(vp8_copy32xn_sse2)
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

        ; The unrolled loop always moves four rows per pass, so it must
        ; not be entered with fewer than four rows left.
        cmp             rcx,        4
        jl              .remaining_rows

.copy_4_rows:
        ; load four source rows (two rows, step, two rows, step)
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        ; store them to four destination rows
        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,        [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,        [rdi+rdx*2]

        sub             rcx,        4
        cmp             rcx,        4
        jge             .copy_4_rows

.remaining_rows:
        ; rcx is 0..3 after the unrolled loop, or the caller's height when
        ; that loop was skipped. A signed <= 0 test also stops a negative
        ; height from entering the count-down loop below.
        cmp             rcx,        0
        jle             .done

.copy_1_row:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,        [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,        [rdi+rdx]

        sub             rcx,        1
        jne             .copy_1_row

.done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret