PageRenderTime 26ms CodeModel.GetById 13ms app.highlight 10ms RepoModel.GetById 1ms app.codeStats 0ms

/media/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm

http://github.com/zpao/v8monkey
Assembly | 207 lines | 139 code | 28 blank | 40 comment | 0 complexity | 79da8281245c2ca513662429c9d44676 MD5 | raw file
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
 10
 11
 12%include "vpx_ports/x86_abi_support.asm"
 13
 14; void vp8_temporal_filter_apply_sse2 | arg
 15;  (unsigned char  *frame1,           |  0
 16;   unsigned int    stride,           |  1
 17;   unsigned char  *frame2,           |  2
 18;   unsigned int    block_size,       |  3
 19;   int             strength,         |  4
 20;   int             filter_weight,    |  5
 21;   unsigned int   *accumulator,      |  6
 22;   unsigned short *count)            |  7
 23global sym(vp8_temporal_filter_apply_sse2)
 24sym(vp8_temporal_filter_apply_sse2):
 25
 26    push        rbp
 27    mov         rbp, rsp
 28    SHADOW_ARGS_TO_STACK 8
 29    SAVE_XMM 7
 30    GET_GOT     rbx
 31    push        rsi
 32    push        rdi
 33    ALIGN_STACK 16, rax
 34    %define block_size    0
 35    %define strength      16
 36    %define filter_weight 32
 37    %define rounding_bit  48
 38    %define rbp_backup    64
 39    %define stack_size    80
 40    sub         rsp,           stack_size
 41    mov         [rsp + rbp_backup], rbp
 42    ; end prolog
 43
 44        mov         rdx,            arg(3)
 45        mov         [rsp + block_size], rdx
 46        movd        xmm6,            arg(4)
 47        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
 48
 49        ; calculate the rounding bit outside the loop
 50        ; 0x8000 >> (16 - strength)
 51        mov         rdx,            16
 52        sub         rdx,            arg(4) ; 16 - strength
 53        movd        xmm4,           rdx    ; can't use rdx w/ shift
 54        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
 55        psrlw       xmm5,           xmm4
 56        movdqa      [rsp + rounding_bit], xmm5
 57
 58        mov         rsi,            arg(0) ; src/frame1
 59        mov         rdx,            arg(2) ; predictor frame
 60        mov         rdi,            arg(6) ; accumulator
 61        mov         rax,            arg(7) ; count
 62
 63        ; dup the filter weight and store for later
 64        movd        xmm0,           arg(5) ; filter_weight
 65        pshuflw     xmm0,           xmm0, 0
 66        punpcklwd   xmm0,           xmm0
 67        movdqa      [rsp + filter_weight], xmm0
 68
 69        mov         rbp,            arg(1) ; stride
 70        pxor        xmm7,           xmm7   ; zero for extraction
 71
 72        lea         rcx,            [rdx + 16*16*1]
 73        cmp         dword ptr [rsp + block_size], 8
 74        jne         temporal_filter_apply_load_16
 75        lea         rcx,            [rdx + 8*8*1]
 76
 77temporal_filter_apply_load_8:
 78        movq        xmm0,           [rsi]  ; first row
 79        lea         rsi,            [rsi + rbp] ; += stride
 80        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
 81        movq        xmm1,           [rsi]  ; second row
 82        lea         rsi,            [rsi + rbp] ; += stride
 83        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
 84        jmp         temporal_filter_apply_load_finished
 85
 86temporal_filter_apply_load_16:
 87        movdqa      xmm0,           [rsi]  ; src (frame1)
 88        lea         rsi,            [rsi + rbp] ; += stride
 89        movdqa      xmm1,           xmm0
 90        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
 91        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
 92
 93temporal_filter_apply_load_finished:
 94        movdqa      xmm2,           [rdx]  ; predictor (frame2)
 95        movdqa      xmm3,           xmm2
 96        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
 97        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
 98
 99        ; modifier = src_byte - pixel_value
100        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
101        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
102
103        ; modifier *= modifier
104        pmullw      xmm0,           xmm0   ; modifer[ 0- 7]^2
105        pmullw      xmm1,           xmm1   ; modifer[ 8-15]^2
106
107        ; modifier *= 3
108        pmullw      xmm0,           [GLOBAL(_const_3w)]
109        pmullw      xmm1,           [GLOBAL(_const_3w)]
110
111        ; modifer += 0x8000 >> (16 - strength)
112        paddw       xmm0,           [rsp + rounding_bit]
113        paddw       xmm1,           [rsp + rounding_bit]
114
115        ; modifier >>= strength
116        psrlw       xmm0,           [rsp + strength]
117        psrlw       xmm1,           [rsp + strength]
118
119        ; modifier = 16 - modifier
120        ; saturation takes care of modifier > 16
121        movdqa      xmm3,           [GLOBAL(_const_16w)]
122        movdqa      xmm2,           [GLOBAL(_const_16w)]
123        psubusw     xmm3,           xmm1
124        psubusw     xmm2,           xmm0
125
126        ; modifier *= filter_weight
127        pmullw      xmm2,           [rsp + filter_weight]
128        pmullw      xmm3,           [rsp + filter_weight]
129
130        ; count
131        movdqa      xmm4,           [rax]
132        movdqa      xmm5,           [rax+16]
133        ; += modifier
134        paddw       xmm4,           xmm2
135        paddw       xmm5,           xmm3
136        ; write back
137        movdqa      [rax],          xmm4
138        movdqa      [rax+16],       xmm5
139        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
140
141        ; load and extract the predictor up to shorts
142        pxor        xmm7,           xmm7
143        movdqa      xmm0,           [rdx]
144        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
145        movdqa      xmm1,           xmm0
146        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
147        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
148
149        ; modifier *= pixel_value
150        pmullw      xmm0,           xmm2
151        pmullw      xmm1,           xmm3
152
153        ; expand to double words
154        movdqa      xmm2,           xmm0
155        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
156        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
157        movdqa      xmm3,           xmm1
158        punpcklwd   xmm1,           xmm7   ; [ 8-11]
159        punpckhwd   xmm3,           xmm7   ; [12-15]
160
161        ; accumulator
162        movdqa      xmm4,           [rdi]
163        movdqa      xmm5,           [rdi+16]
164        movdqa      xmm6,           [rdi+32]
165        movdqa      xmm7,           [rdi+48]
166        ; += modifier
167        paddd       xmm4,           xmm0
168        paddd       xmm5,           xmm2
169        paddd       xmm6,           xmm1
170        paddd       xmm7,           xmm3
171        ; write back
172        movdqa      [rdi],          xmm4
173        movdqa      [rdi+16],       xmm5
174        movdqa      [rdi+32],       xmm6
175        movdqa      [rdi+48],       xmm7
176        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
177
178        cmp         rdx,            rcx
179        je          temporal_filter_apply_epilog
180        pxor        xmm7,           xmm7   ; zero for extraction
181        cmp         dword ptr [rsp + block_size], 16
182        je          temporal_filter_apply_load_16
183        jmp         temporal_filter_apply_load_8
184
185temporal_filter_apply_epilog:
186    ; begin epilog
187    mov         rbp,            [rsp + rbp_backup]
188    add         rsp,            stack_size
189    pop         rsp
190    pop         rdi
191    pop         rsi
192    RESTORE_GOT
193    RESTORE_XMM
194    UNSHADOW_ARGS
195    pop         rbp
196    ret
197
; Read-only 16-byte-aligned constants, each holding eight identical
; 16-bit words so it can be used directly as a packed-word operand.
SECTION_RODATA
align 16
_const_3w:                      ; 8 x 3
    times 8 dw 3
align 16
_const_top_bit:                 ; 8 x 0x8000 (top bit of each word)
    times 8 dw 1<<15
align 16
_const_16w:                     ; 8 x 16
    times 8 dw 16