PageRenderTime 37ms CodeModel.GetById 14ms app.highlight 19ms RepoModel.GetById 1ms app.codeStats 0ms

/media/libvpx/vp8/encoder/x86/variance_impl_ssse3.asm

http://github.com/zpao/v8monkey
Assembly | 364 lines | 264 code | 74 blank | 26 comment | 0 complexity | ad8249bbcc257d22147c3709978449a9 MD5 | raw file
  1;
  2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3;
  4;  Use of this source code is governed by a BSD-style license
  5;  that can be found in the LICENSE file in the root of the source
  6;  tree. An additional intellectual property rights grant can be found
  7;  in the file PATENTS.  All contributing project authors may
  8;  be found in the AUTHORS file in the root of the source tree.
  9;
 10
 11
 12%include "vpx_ports/x86_abi_support.asm"
 13
 14%define xmm_filter_shift            7
 15
 16
 17;void vp8_filter_block2d_bil_var_ssse3
 18;(
 19;    unsigned char *ref_ptr,
 20;    int ref_pixels_per_line,
 21;    unsigned char *src_ptr,
 22;    int src_pixels_per_line,
 23;    unsigned int Height,
 24;    int  xoffset,
 25;    int  yoffset,
 26;    int *sum,
 27;    unsigned int *sumsquared;;
 28;
 29;)
 30;Note: The filter coefficient at offset=0 is 128. Since the second register
 31;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
 32global sym(vp8_filter_block2d_bil_var_ssse3)
 33sym(vp8_filter_block2d_bil_var_ssse3):
 34    push        rbp
 35    mov         rbp, rsp
 36    SHADOW_ARGS_TO_STACK 9
 37    SAVE_XMM 7
 38    GET_GOT     rbx
 39    push rsi
 40    push rdi
 41    ; end prolog
 42
 43        pxor            xmm6,           xmm6
 44        pxor            xmm7,           xmm7
 45
 46        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
 47        movsxd          rax,            dword ptr arg(5)     ; xoffset
 48
 49        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
 50        je              filter_block2d_bil_var_ssse3_sp_only
 51
 52        shl             rax,            4                    ; point to filter coeff with xoffset
 53        lea             rax,            [rax + rcx]          ; HFilter
 54
 55        movsxd          rdx,            dword ptr arg(6)     ; yoffset
 56
 57        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
 58        je              filter_block2d_bil_var_ssse3_fp_only
 59
 60        shl             rdx,            4
 61        lea             rdx,            [rdx + rcx]          ; VFilter
 62
 63        mov             rsi,            arg(0)               ;ref_ptr
 64        mov             rdi,            arg(2)               ;src_ptr
 65        movsxd          rcx,            dword ptr arg(4)     ;Height
 66
 67        movdqu          xmm0,           XMMWORD PTR [rsi]
 68        movdqu          xmm1,           XMMWORD PTR [rsi+1]
 69        movdqa          xmm2,           xmm0
 70
 71        punpcklbw       xmm0,           xmm1
 72        punpckhbw       xmm2,           xmm1
 73        pmaddubsw       xmm0,           [rax]
 74        pmaddubsw       xmm2,           [rax]
 75
 76        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
 77        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
 78        psraw           xmm0,           xmm_filter_shift
 79        psraw           xmm2,           xmm_filter_shift
 80
 81        packuswb        xmm0,           xmm2
 82
 83%if ABI_IS_32BIT
 84        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
 85%else
 86        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
 87        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 88        lea             rsi,            [rsi + r8]
 89%endif
 90
 91filter_block2d_bil_var_ssse3_loop:
 92        movdqu          xmm1,           XMMWORD PTR [rsi]
 93        movdqu          xmm2,           XMMWORD PTR [rsi+1]
 94        movdqa          xmm3,           xmm1
 95
 96        punpcklbw       xmm1,           xmm2
 97        punpckhbw       xmm3,           xmm2
 98        pmaddubsw       xmm1,           [rax]
 99        pmaddubsw       xmm3,           [rax]
100
101        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
102        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
103        psraw           xmm1,           xmm_filter_shift
104        psraw           xmm3,           xmm_filter_shift
105        packuswb        xmm1,           xmm3
106
107        movdqa          xmm2,           xmm0
108        movdqa          xmm0,           xmm1
109        movdqa          xmm3,           xmm2
110
111        punpcklbw       xmm2,           xmm1
112        punpckhbw       xmm3,           xmm1
113        pmaddubsw       xmm2,           [rdx]
114        pmaddubsw       xmm3,           [rdx]
115
116        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
117        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
118        psraw           xmm2,           xmm_filter_shift
119        psraw           xmm3,           xmm_filter_shift
120
121        movq            xmm1,           QWORD PTR [rdi]
122        pxor            xmm4,           xmm4
123        punpcklbw       xmm1,           xmm4
124        movq            xmm5,           QWORD PTR [rdi+8]
125        punpcklbw       xmm5,           xmm4
126
127        psubw           xmm2,           xmm1
128        psubw           xmm3,           xmm5
129        paddw           xmm6,           xmm2
130        paddw           xmm6,           xmm3
131        pmaddwd         xmm2,           xmm2
132        pmaddwd         xmm3,           xmm3
133        paddd           xmm7,           xmm2
134        paddd           xmm7,           xmm3
135
136%if ABI_IS_32BIT
137        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
138        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
139%else
140        lea             rsi,            [rsi + r8]
141        lea             rdi,            [rdi + r9]
142%endif
143
144        sub             rcx,            1
145        jnz             filter_block2d_bil_var_ssse3_loop
146
147        jmp             filter_block2d_bil_variance
148
149filter_block2d_bil_var_ssse3_sp_only:
150        movsxd          rdx,            dword ptr arg(6)     ; yoffset
151
152        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
153        je              filter_block2d_bil_var_ssse3_full_pixel
154
155        shl             rdx,            4
156        lea             rdx,            [rdx + rcx]          ; VFilter
157
158        mov             rsi,            arg(0)               ;ref_ptr
159        mov             rdi,            arg(2)               ;src_ptr
160        movsxd          rcx,            dword ptr arg(4)     ;Height
161        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
162
163        movdqu          xmm1,           XMMWORD PTR [rsi]
164        movdqa          xmm0,           xmm1
165
166%if ABI_IS_32BIT=0
167        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
168%endif
169
170        lea             rsi,            [rsi + rax]
171
172filter_block2d_bil_sp_only_loop:
173        movdqu          xmm3,           XMMWORD PTR [rsi]
174        movdqa          xmm2,           xmm1
175        movdqa          xmm0,           xmm3
176
177        punpcklbw       xmm1,           xmm3
178        punpckhbw       xmm2,           xmm3
179        pmaddubsw       xmm1,           [rdx]
180        pmaddubsw       xmm2,           [rdx]
181
182        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
183        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
184        psraw           xmm1,           xmm_filter_shift
185        psraw           xmm2,           xmm_filter_shift
186
187        movq            xmm3,           QWORD PTR [rdi]
188        pxor            xmm4,           xmm4
189        punpcklbw       xmm3,           xmm4
190        movq            xmm5,           QWORD PTR [rdi+8]
191        punpcklbw       xmm5,           xmm4
192
193        psubw           xmm1,           xmm3
194        psubw           xmm2,           xmm5
195        paddw           xmm6,           xmm1
196        paddw           xmm6,           xmm2
197        pmaddwd         xmm1,           xmm1
198        pmaddwd         xmm2,           xmm2
199        paddd           xmm7,           xmm1
200        paddd           xmm7,           xmm2
201
202        movdqa          xmm1,           xmm0
203        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
204
205%if ABI_IS_32BIT
206        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
207%else
208        lea             rdi,            [rdi + r9]
209%endif
210
211        sub             rcx,            1
212        jnz             filter_block2d_bil_sp_only_loop
213
214        jmp             filter_block2d_bil_variance
215
216filter_block2d_bil_var_ssse3_full_pixel:
217        mov             rsi,            arg(0)               ;ref_ptr
218        mov             rdi,            arg(2)               ;src_ptr
219        movsxd          rcx,            dword ptr arg(4)     ;Height
220        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
221        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
222        pxor            xmm0,           xmm0
223
224filter_block2d_bil_full_pixel_loop:
225        movq            xmm1,           QWORD PTR [rsi]
226        punpcklbw       xmm1,           xmm0
227        movq            xmm2,           QWORD PTR [rsi+8]
228        punpcklbw       xmm2,           xmm0
229
230        movq            xmm3,           QWORD PTR [rdi]
231        punpcklbw       xmm3,           xmm0
232        movq            xmm4,           QWORD PTR [rdi+8]
233        punpcklbw       xmm4,           xmm0
234
235        psubw           xmm1,           xmm3
236        psubw           xmm2,           xmm4
237        paddw           xmm6,           xmm1
238        paddw           xmm6,           xmm2
239        pmaddwd         xmm1,           xmm1
240        pmaddwd         xmm2,           xmm2
241        paddd           xmm7,           xmm1
242        paddd           xmm7,           xmm2
243
244        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
245        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
246        sub             rcx,            1
247        jnz             filter_block2d_bil_full_pixel_loop
248
249        jmp             filter_block2d_bil_variance
250
251filter_block2d_bil_var_ssse3_fp_only:
252        mov             rsi,            arg(0)               ;ref_ptr
253        mov             rdi,            arg(2)               ;src_ptr
254        movsxd          rcx,            dword ptr arg(4)     ;Height
255        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
256
257        pxor            xmm0,           xmm0
258
259%if ABI_IS_32BIT=0
260        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
261%endif
262
263filter_block2d_bil_fp_only_loop:
264        movdqu          xmm1,           XMMWORD PTR [rsi]
265        movdqu          xmm2,           XMMWORD PTR [rsi+1]
266        movdqa          xmm3,           xmm1
267
268        punpcklbw       xmm1,           xmm2
269        punpckhbw       xmm3,           xmm2
270        pmaddubsw       xmm1,           [rax]
271        pmaddubsw       xmm3,           [rax]
272
273        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
274        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
275        psraw           xmm1,           xmm_filter_shift
276        psraw           xmm3,           xmm_filter_shift
277
278        movq            xmm2,           XMMWORD PTR [rdi]
279        pxor            xmm4,           xmm4
280        punpcklbw       xmm2,           xmm4
281        movq            xmm5,           QWORD PTR [rdi+8]
282        punpcklbw       xmm5,           xmm4
283
284        psubw           xmm1,           xmm2
285        psubw           xmm3,           xmm5
286        paddw           xmm6,           xmm1
287        paddw           xmm6,           xmm3
288        pmaddwd         xmm1,           xmm1
289        pmaddwd         xmm3,           xmm3
290        paddd           xmm7,           xmm1
291        paddd           xmm7,           xmm3
292
293        lea             rsi,            [rsi + rdx]
294%if ABI_IS_32BIT
295        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
296%else
297        lea             rdi,            [rdi + r9]
298%endif
299
300        sub             rcx,            1
301        jnz             filter_block2d_bil_fp_only_loop
302
303        jmp             filter_block2d_bil_variance
304
305filter_block2d_bil_variance:
306        pxor        xmm0,           xmm0
307        pxor        xmm1,           xmm1
308        pxor        xmm5,           xmm5
309
310        punpcklwd   xmm0,           xmm6
311        punpckhwd   xmm1,           xmm6
312        psrad       xmm0,           16
313        psrad       xmm1,           16
314        paddd       xmm0,           xmm1
315        movdqa      xmm1,           xmm0
316
317        movdqa      xmm6,           xmm7
318        punpckldq   xmm6,           xmm5
319        punpckhdq   xmm7,           xmm5
320        paddd       xmm6,           xmm7
321
322        punpckldq   xmm0,           xmm5
323        punpckhdq   xmm1,           xmm5
324        paddd       xmm0,           xmm1
325
326        movdqa      xmm7,           xmm6
327        movdqa      xmm1,           xmm0
328
329        psrldq      xmm7,           8
330        psrldq      xmm1,           8
331
332        paddd       xmm6,           xmm7
333        paddd       xmm0,           xmm1
334
335        mov         rsi,            arg(7) ;[Sum]
336        mov         rdi,            arg(8) ;[SSE]
337
338        movd        [rsi],       xmm0
339        movd        [rdi],       xmm6
340
341    ; begin epilog
342    pop rdi
343    pop rsi
344    RESTORE_GOT
345    RESTORE_XMM
346    UNSHADOW_ARGS
347    pop         rbp
348    ret
349
350
SECTION_RODATA

; Rounding term (eight 16-bit lanes of 64) added to each filtered word
; before the arithmetic shift right by xmm_filter_shift.
align 16
xmm_bi_rd:
    times 8 dw 64

; Bilinear tap table for pmaddubsw: one 16-byte entry per sub-pixel
; offset 0..7, each entry being the byte pair (128 - 16*n, 16*n) repeated
; eight times. Entry 0 is (128, 0); the last entry is (16, 112).
align 16
vp8_bilinear_filters_ssse3:
%assign bil_second_tap 0
%rep 8
    times 8 db (128 - bil_second_tap), bil_second_tap
%assign bil_second_tap bil_second_tap + 16
%endrep
%undef bil_second_tap