/media/libvpx/vp8/common/x86/subpixel_mmx.asm
http://github.com/zpao/v8monkey · Assembly · 727 lines · 466 code · 181 blank · 80 comment · 0 complexity · c5007295dc4543de92fdd3602b3e30ec MD5 · raw file
- ;
- ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- ;
- ; Use of this source code is governed by a BSD-style license
- ; that can be found in the LICENSE file in the root of the source
- ; tree. An additional intellectual property rights grant can be found
- ; in the file PATENTS. All contributing project authors may
- ; be found in the AUTHORS file in the root of the source tree.
- ;
- %include "vpx_ports/x86_abi_support.asm"
- %define BLOCK_HEIGHT_WIDTH 4 ; not referenced by the routines visible in this file
- %define vp8_filter_weight 128 ; equals 1 << VP8_FILTER_SHIFT; every tap set in the tables at the end of this file sums to this value
- %define VP8_FILTER_SHIFT 7 ; arithmetic right shift (psraw) applied to each rounded filter sum
- ;void vp8_filter_block1d_h6_mmx -- horizontal 6-tap filter pass; each row yields 4 results, clamped to 0..255 and stored as 16-bit words
- ;(
- ; unsigned char *src_ptr, arg(0): first source pixel (p0) of the row; the loop reads bytes p-2 .. p6
- ; unsigned short *output_ptr, arg(1): destination; 8 bytes (4 words) are written per row
- ; unsigned int src_pixels_per_line, arg(2): added to the source pointer after each row
- ; unsigned int pixel_step, arg(3): not read by this routine
- ; unsigned int output_height, arg(4): row count; must be at least 1 because the count is tested after the loop body
- ; unsigned int output_width, arg(5): added to the destination pointer, as a byte offset, after each row
- ; short * vp8_filter arg(6): six tap entries spaced 16 bytes apart; only the first 4 words of each entry are loaded
- ;)
- global sym(vp8_filter_block1d_h6_mmx)
- sym(vp8_filter_block1d_h6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- mov rdx, arg(6) ;vp8_filter
- movq mm1, [rdx + 16] ; mm1 = tap 1; taps 1..4 stay in registers for the whole loop
- movq mm2, [rdx + 32] ; mm2 = tap 2
- movq mm6, [rdx + 48] ; mm6 = tap 3
- movq mm7, [rdx + 64] ; mm7 = tap 4
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;output_height = row counter
- movsxd rax, dword ptr arg(5) ;output_width, used below as the per-row destination advance in bytes
- pxor mm0, mm0 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
- nextrow:
- movq mm3, [rsi-2] ; mm3 = p-2..p5
- movq mm4, mm3 ; mm4 = p-2..p5
- psrlq mm3, 8 ; mm3 = p-1..p5
- punpcklbw mm3, mm0 ; mm3 = p-1..p2 as words
- pmullw mm3, mm1 ; mm3 *= tap 1
- movq mm5, mm4 ; mm5 = p-2..p5
- punpckhbw mm4, mm0 ; mm4 = p2..p5 as words
- pmullw mm4, mm7 ; mm4 *= tap 4
- paddsw mm3, mm4 ; mm3 += mm4 (signed saturating, so the accumulation order matters)
- movq mm4, mm5 ; mm4 = p-2..p5
- psrlq mm5, 16 ; mm5 = p0..p5
- punpcklbw mm5, mm0 ; mm5 = p0..p3 as words
- pmullw mm5, mm2 ; mm5 *= tap 2
- paddsw mm3, mm5 ; mm3 += mm5
- movq mm5, mm4 ; mm5 = p-2..p5
- psrlq mm4, 24 ; mm4 = p1..p5
- punpcklbw mm4, mm0 ; mm4 = p1..p4 as words
- pmullw mm4, mm6 ; mm4 *= tap 3
- paddsw mm3, mm4 ; mm3 += mm4
- ; outer taps (5 and 0) are multiplied straight from memory
- movd mm4, [rsi+3] ; mm4 = p3..p6
- punpcklbw mm4, mm0 ; mm4 = p3..p6 as words
- pmullw mm4, [rdx+80] ; mm4 *= tap 5
- paddsw mm3, mm4 ; mm3 += mm4
- punpcklbw mm5, mm0 ; mm5 = p-2..p1 as words
- pmullw mm5, [rdx] ; mm5 *= tap 0
- paddsw mm3, mm5 ; mm3 += mm5
- paddsw mm3, [GLOBAL(rd)] ; mm3 += round value (64 per word)
- psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. divide by 128
- packuswb mm3, mm0 ; pack to bytes to clamp each result to 0..255 ...
- punpcklbw mm3, mm0 ; ... then widen back to 4 words for the 16-bit output
- movq [rdi], mm3 ; store the results in the destination
- %if ABI_IS_32BIT
- add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
- add rdi, rax; next output row
- %else
- movsxd r8, dword ptr arg(2) ;src_pixels_per_line (reloaded on every iteration)
- add rdi, rax; next output row
- add rsi, r8 ; next line
- %endif
- dec rcx ; decrement count
- jnz nextrow ; next row
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ;void vp8_filter_block1dc_v6_mmx -- vertical 6-tap filter pass over rows of 16-bit values; each row yields 4 bytes clamped to 0..255
- ;(
- ; short *src_ptr, arg(0): row 0 of the input; rows -2 .. +3 relative to each output row are read, 4 words per row
- ; unsigned char *output_ptr, arg(1): destination; 4 bytes are written per row
- ; int output_pitch, arg(2): added to the destination pointer after each row
- ; unsigned int pixels_per_line, arg(3): distance between input rows, applied directly as a byte offset
- ; unsigned int pixel_step, arg(4): not read by this routine
- ; unsigned int output_height, arg(5): row count; must be at least 1 because the count is tested after the loop body
- ; unsigned int output_width, arg(6): not read by this routine
- ; short * vp8_filter arg(7): six tap entries spaced 16 bytes apart; only the first 4 words of each entry are loaded
- ;)
- global sym(vp8_filter_block1dc_v6_mmx)
- sym(vp8_filter_block1dc_v6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- movq mm5, [GLOBAL(rd)] ; mm5 = round value; NOTE(review): presumably loaded here because GLOBAL() may address through rbx, which is overwritten below -- confirm in x86_abi_support.asm
- push rbx ; rbx is reused as the filter pointer; restored by the pop rbx after the loop
- mov rbx, arg(7) ;vp8_filter
- movq mm1, [rbx + 16] ; mm1 = tap 1; taps 1..4 stay in registers for the whole loop
- movq mm2, [rbx + 32] ; mm2 = tap 2
- movq mm6, [rbx + 48] ; mm6 = tap 3
- movq mm7, [rbx + 64] ; mm7 = tap 4
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- sub rsi, rdx ; step back two rows ...
- sub rsi, rdx ; ... so rsi points at row -2
- movsxd rcx, DWORD PTR arg(5) ;output_height = row counter
- movsxd rax, DWORD PTR arg(2) ;output_pitch = per-row destination advance
- pxor mm0, mm0 ; mm0 = 00000000
- nextrow_cv:
- movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
- pmullw mm3, mm1 ; mm3 *= tap 1
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
- pmullw mm4, mm7 ; mm4 *= tap 4
- paddsw mm3, mm4 ; mm3 += mm4 (signed saturating, so the accumulation order matters)
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
- pmullw mm4, mm2 ; mm4 *= tap 2
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [rsi] ; mm4 = p0..p3 = row -2
- pmullw mm4, [rbx] ; mm4 *= tap 0
- paddsw mm3, mm4 ; mm3 += mm4
- add rsi, rdx ; move source forward 1 line to avoid 3 * pitch; this is also the only per-row advance of the source pointer
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
- pmullw mm4, mm6 ; mm4 *= tap 3
- paddsw mm3, mm4 ; mm3 += mm4
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
- pmullw mm4, [rbx +80] ; mm4 *= tap 5
- paddsw mm3, mm4 ; mm3 += mm4
- paddsw mm3, mm5 ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. divide by 128
- packuswb mm3, mm0 ; pack to bytes, clamping each result to 0..255
- movd [rdi],mm3 ; store the 4 result bytes in the destination
- ; Each iteration re-reads 5 of the 6 rows loaded by the previous one. Since the
- ; recon block should be in cache this shouldn't cost much. It's obviously
- ; avoidable!
- lea rdi, [rdi+rax] ; next output row
- dec rcx ; decrement count
- jnz nextrow_cv ; next row
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ;void vp8_bilinear_predict8x8_mmx -- two-pass (horizontal, then vertical) bilinear prediction producing 8 rows of 8 bytes
- ;(
- ; unsigned char *src_ptr, arg(0): top-left source pixel; 9 rows of 9 bytes are read
- ; int src_pixels_per_line, arg(1): source stride, added to the source pointer per row
- ; int xoffset, arg(2): selects the horizontal tap pair at vp8_bilinear_filters_mmx + xoffset*32
- ; int yoffset, arg(3): selects the vertical tap pair at vp8_bilinear_filters_mmx + yoffset*32
- ; unsigned char *dst_ptr, arg(4): destination; 8 bytes are written per row
- ; int dst_pitch arg(5): destination stride; the loop ends when the destination pointer equals dst_ptr + 8*dst_pitch
- ;)
- global sym(vp8_bilinear_predict8x8_mmx)
- sym(vp8_bilinear_predict8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
- shl rax, 5 ; offset * 32 (one tap pair is 32 bytes)
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ; mm1 = horizontal tap 0 (applied to pixel x)
- movq mm2, [rax+16] ; mm2 = horizontal tap 1 (applied to pixel x+1)
- movsxd rax, dword ptr arg(3) ;yoffset
- pxor mm0, mm0 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
- shl rax, 5 ; offset*32
- add rax, rcx ; VFilter; rax keeps pointing at the vertical tap pair for the whole loop
- lea rcx, [rdi+rdx*8] ; rcx = end-of-block destination pointer (dst_ptr + 8*dst_pitch)
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; mm3 = source pixels 0..7 of the first row
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = pixels 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = pixels 4..7 as words
- pmullw mm3, mm1 ; *= horizontal tap 0
- pmullw mm4, mm1 ;
- movq mm5, [rsi+1] ; mm5 = source pixels 1..8
- movq mm6, mm5 ;
- punpcklbw mm5, mm0 ; mm5 = pixels 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = pixels 5..8 as words
- pmullw mm5, mm2 ; *= horizontal tap 1
- pmullw mm6, mm2 ;
- paddw mm3, mm5 ; mm3/mm4 = unrounded horizontal sums
- paddw mm4, mm6 ;
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
- movq mm7, mm3 ; mm7 = horizontally filtered first row ...
- packuswb mm7, mm4 ; ... packed to 8 bytes; it is the "previous row" of the first loop iteration
- add rsi, rdx ; next line
- next_row_8x8:
- movq mm3, [rsi] ; mm3 = source pixels 0..7 of the current row
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = pixels 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = pixels 4..7 as words
- pmullw mm3, mm1 ; *= horizontal tap 0
- pmullw mm4, mm1 ;
- movq mm5, [rsi+1] ; mm5 = source pixels 1..8
- movq mm6, mm5 ;
- punpcklbw mm5, mm0 ; mm5 = pixels 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = pixels 5..8 as words
- pmullw mm5, mm2 ; *= horizontal tap 1
- pmullw mm6, mm2 ;
- paddw mm3, mm5 ; mm3/mm4 = unrounded horizontal sums of the current row
- paddw mm4, mm6 ;
- movq mm5, mm7 ; previous row's horizontal result (packed bytes)
- movq mm6, mm7 ;
- punpcklbw mm5, mm0 ; previous row, results 0..3 as words
- punpckhbw mm6, mm0 ; previous row, results 4..7 as words
- pmullw mm5, [rax] ; previous row *= vertical tap 0
- pmullw mm6, [rax] ;
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
- movq mm7, mm3 ; save this row's horizontal result ...
- packuswb mm7, mm4 ; ... as packed bytes for the next iteration
- pmullw mm3, [rax+16] ; current row *= vertical tap 1
- pmullw mm4, [rax+16] ;
- paddw mm3, mm5 ; add the weighted previous row
- paddw mm4, mm6 ;
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
- packuswb mm3, mm4 ; pack the 8 results to bytes, clamping to 0..255
- movq [rdi], mm3 ; store the results in the destination
- %if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
- %else
- movsxd r8, dword ptr arg(5) ;dst_pitch (reloaded on every iteration)
- add rsi, rdx ; next line
- add rdi, r8 ;dst_pitch
- %endif
- cmp rdi, rcx ; reached the end-of-block pointer?
- jne next_row_8x8
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ;void vp8_bilinear_predict8x4_mmx -- two-pass (horizontal, then vertical) bilinear prediction producing 4 rows of 8 bytes
- ;(
- ; unsigned char *src_ptr, arg(0): top-left source pixel; 5 rows of 9 bytes are read
- ; int src_pixels_per_line, arg(1): source stride, added to the source pointer per row
- ; int xoffset, arg(2): selects the horizontal tap pair at vp8_bilinear_filters_mmx + xoffset*32
- ; int yoffset, arg(3): selects the vertical tap pair at vp8_bilinear_filters_mmx + yoffset*32
- ; unsigned char *dst_ptr, arg(4): destination; 8 bytes are written per row
- ; int dst_pitch arg(5): destination stride; the loop ends when the destination pointer equals dst_ptr + 4*dst_pitch
- ;)
- global sym(vp8_bilinear_predict8x4_mmx)
- sym(vp8_bilinear_predict8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
- shl rax, 5 ; offset * 32 (one tap pair is 32 bytes)
- mov rsi, arg(0) ;src_ptr ;
- add rax, rcx ; HFilter
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ; mm1 = horizontal tap 0 (applied to pixel x)
- movq mm2, [rax+16] ; mm2 = horizontal tap 1 (applied to pixel x+1)
- movsxd rax, dword ptr arg(3) ;yoffset
- pxor mm0, mm0 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
- shl rax, 5 ; offset * 32
- add rax, rcx ; VFilter; rax keeps pointing at the vertical tap pair for the whole loop
- lea rcx, [rdi+rdx*4] ; rcx = end-of-block destination pointer (dst_ptr + 4*dst_pitch)
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; mm3 = source pixels 0..7 of the first row
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = pixels 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = pixels 4..7 as words
- pmullw mm3, mm1 ; *= horizontal tap 0
- pmullw mm4, mm1 ;
- movq mm5, [rsi+1] ; mm5 = source pixels 1..8
- movq mm6, mm5 ;
- punpcklbw mm5, mm0 ; mm5 = pixels 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = pixels 5..8 as words
- pmullw mm5, mm2 ; *= horizontal tap 1
- pmullw mm6, mm2 ;
- paddw mm3, mm5 ; mm3/mm4 = unrounded horizontal sums
- paddw mm4, mm6 ;
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
- movq mm7, mm3 ; mm7 = horizontally filtered first row ...
- packuswb mm7, mm4 ; ... packed to 8 bytes; it is the "previous row" of the first loop iteration
- add rsi, rdx ; next line
- next_row_8x4:
- movq mm3, [rsi] ; mm3 = source pixels 0..7 of the current row
- movq mm4, mm3 ; make a copy of current line
- punpcklbw mm3, mm0 ; mm3 = pixels 0..3 as words
- punpckhbw mm4, mm0 ; mm4 = pixels 4..7 as words
- pmullw mm3, mm1 ; *= horizontal tap 0
- pmullw mm4, mm1 ;
- movq mm5, [rsi+1] ; mm5 = source pixels 1..8
- movq mm6, mm5 ;
- punpcklbw mm5, mm0 ; mm5 = pixels 1..4 as words
- punpckhbw mm6, mm0 ; mm6 = pixels 5..8 as words
- pmullw mm5, mm2 ; *= horizontal tap 1
- pmullw mm6, mm2 ;
- paddw mm3, mm5 ; mm3/mm4 = unrounded horizontal sums of the current row
- paddw mm4, mm6 ;
- movq mm5, mm7 ; previous row's horizontal result (packed bytes)
- movq mm6, mm7 ;
- punpcklbw mm5, mm0 ; previous row, results 0..3 as words
- punpckhbw mm6, mm0 ; previous row, results 4..7 as words
- pmullw mm5, [rax] ; previous row *= vertical tap 0
- pmullw mm6, [rax] ;
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
- movq mm7, mm3 ; save this row's horizontal result ...
- packuswb mm7, mm4 ; ... as packed bytes for the next iteration
- pmullw mm3, [rax+16] ; current row *= vertical tap 1
- pmullw mm4, [rax+16] ;
- paddw mm3, mm5 ; add the weighted previous row
- paddw mm4, mm6 ;
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
- packuswb mm3, mm4 ; pack the 8 results to bytes, clamping to 0..255
- movq [rdi], mm3 ; store the results in the destination
- %if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
- %else
- movsxd r8, dword ptr arg(5) ;dst_pitch (reloaded on every iteration)
- add rsi, rdx ; next line
- add rdi, r8 ; next output row
- %endif
- cmp rdi, rcx ; reached the end-of-block pointer?
- jne next_row_8x4
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- ;void vp8_bilinear_predict4x4_mmx -- two-pass (horizontal, then vertical) bilinear prediction producing 4 rows of 4 bytes
- ;(
- ; unsigned char *src_ptr, arg(0): top-left source pixel; 5 rows of 5 bytes are read
- ; int src_pixels_per_line, arg(1): source stride, added to the source pointer per row
- ; int xoffset, arg(2): selects the horizontal tap pair at vp8_bilinear_filters_mmx + xoffset*32
- ; int yoffset, arg(3): selects the vertical tap pair at vp8_bilinear_filters_mmx + yoffset*32
- ; unsigned char *dst_ptr, arg(4): destination; 4 bytes are written per row
- ; int dst_pitch arg(5): destination stride; the loop ends when the destination pointer equals dst_ptr + 4*dst_pitch
- ;)
- global sym(vp8_bilinear_predict4x4_mmx)
- sym(vp8_bilinear_predict4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
- ;const short *HFilter = bilinear_filters_mmx[xoffset];
- ;const short *VFilter = bilinear_filters_mmx[yoffset];
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
- shl rax, 5 ; offset * 32 (one tap pair is 32 bytes)
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ; mm1 = horizontal tap 0 (applied to pixel x)
- movq mm2, [rax+16] ; mm2 = horizontal tap 1 (applied to pixel x+1)
- movsxd rax, dword ptr arg(3) ;yoffset
- pxor mm0, mm0 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
- shl rax, 5 ; offset * 32
- add rax, rcx ; VFilter; rax keeps pointing at the vertical tap pair for the whole loop
- lea rcx, [rdi+rdx*4] ; rcx = end-of-block destination pointer (dst_ptr + 4*dst_pitch)
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
- ; get the first horizontal line done ;
- movd mm3, [rsi] ; mm3 = source pixels 0..3 of the first row
- punpcklbw mm3, mm0 ; mm3 = pixels 0..3 as words
- pmullw mm3, mm1 ; *= horizontal tap 0
- movd mm5, [rsi+1] ; mm5 = source pixels 1..4
- punpcklbw mm5, mm0 ; mm5 = pixels 1..4 as words
- pmullw mm5, mm2 ; *= horizontal tap 1
- paddw mm3, mm5 ; mm3 = unrounded horizontal sums
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- movq mm7, mm3 ; mm7 = horizontally filtered first row ...
- packuswb mm7, mm0 ; ... packed to 4 bytes; it is the "previous row" of the first loop iteration
- add rsi, rdx ; next line
- next_row_4x4:
- movd mm3, [rsi] ; mm3 = source pixels 0..3 of the current row
- punpcklbw mm3, mm0 ; mm3 = pixels 0..3 as words
- pmullw mm3, mm1 ; *= horizontal tap 0
- movd mm5, [rsi+1] ; mm5 = source pixels 1..4
- punpcklbw mm5, mm0 ; mm5 = pixels 1..4 as words
- pmullw mm5, mm2 ; *= horizontal tap 1
- paddw mm3, mm5 ; mm3 = unrounded horizontal sums of the current row
- movq mm5, mm7 ; previous row's horizontal result (packed bytes)
- punpcklbw mm5, mm0 ; previous row as words
- pmullw mm5, [rax] ; previous row *= vertical tap 0
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- movq mm7, mm3 ; save this row's horizontal result ...
- packuswb mm7, mm0 ; ... as packed bytes for the next iteration
- pmullw mm3, [rax+16] ; current row *= vertical tap 1
- paddw mm3, mm5 ; add the weighted previous row
- paddw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack the 4 results to bytes, clamping to 0..255
- movd [rdi], mm3 ; store the results in the destination
- %if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
- %else
- movsxd r8, dword ptr arg(5) ;dst_pitch (reloaded on every iteration)
- add rsi, rdx ; next line
- add rdi, r8 ; next output row
- %endif
- cmp rdi, rcx ; reached the end-of-block pointer?
- jne next_row_4x4
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
- SECTION_RODATA
- align 16
- rd: ; rounding term added to every filter sum before the VP8_FILTER_SHIFT right shift
- times 4 dw 0x40 ; four words of 64 (half of 128), one per 16-bit MMX lane
- align 16
- global HIDDEN_DATA(sym(vp8_six_tap_mmx)) ; 8 sets of 6 taps; each tap is replicated into 8 words (16 bytes), so one set is 96 bytes and sums to 128
- sym(vp8_six_tap_mmx): ; NOTE(review): presumably what the vp8_filter arguments of the 6-tap routines point into -- their [ptr + 16*tap] loads match this layout; confirm against callers
- times 8 dw 0 ; set 0: tap 0
- times 8 dw 0 ; tap 1
- times 8 dw 128 ; tap 2
- times 8 dw 0 ; tap 3
- times 8 dw 0 ; tap 4
- times 8 dw 0 ; tap 5
- times 8 dw 0 ; set 1: tap 0
- times 8 dw -6 ; tap 1
- times 8 dw 123 ; tap 2
- times 8 dw 12 ; tap 3
- times 8 dw -1 ; tap 4
- times 8 dw 0 ; tap 5
- times 8 dw 2 ; set 2: tap 0
- times 8 dw -11 ; tap 1
- times 8 dw 108 ; tap 2
- times 8 dw 36 ; tap 3
- times 8 dw -8 ; tap 4
- times 8 dw 1 ; tap 5
- times 8 dw 0 ; set 3: tap 0
- times 8 dw -9 ; tap 1
- times 8 dw 93 ; tap 2
- times 8 dw 50 ; tap 3
- times 8 dw -6 ; tap 4
- times 8 dw 0 ; tap 5
- times 8 dw 3 ; set 4: tap 0
- times 8 dw -16 ; tap 1
- times 8 dw 77 ; tap 2
- times 8 dw 77 ; tap 3
- times 8 dw -16 ; tap 4
- times 8 dw 3 ; tap 5
- times 8 dw 0 ; set 5: tap 0
- times 8 dw -6 ; tap 1
- times 8 dw 50 ; tap 2
- times 8 dw 93 ; tap 3
- times 8 dw -9 ; tap 4
- times 8 dw 0 ; tap 5
- times 8 dw 1 ; set 6: tap 0
- times 8 dw -8 ; tap 1
- times 8 dw 36 ; tap 2
- times 8 dw 108 ; tap 3
- times 8 dw -11 ; tap 4
- times 8 dw 2 ; tap 5
- times 8 dw 0 ; set 7: tap 0
- times 8 dw -1 ; tap 1
- times 8 dw 12 ; tap 2
- times 8 dw 123 ; tap 3
- times 8 dw -6 ; tap 4
- times 8 dw 0 ; tap 5
- align 16
- global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx)) ; 8 tap pairs; each tap is replicated into 8 words (16 bytes), so one pair is 32 bytes and sums to 128
- sym(vp8_bilinear_filters_mmx): ; indexed as base + offset*32 by the bilinear routines in this file; tap 0 at +0, tap 1 at +16
- times 8 dw 128 ; pair 0: tap 0
- times 8 dw 0 ; tap 1
- times 8 dw 112 ; pair 1: tap 0
- times 8 dw 16 ; tap 1
- times 8 dw 96 ; pair 2: tap 0
- times 8 dw 32 ; tap 1
- times 8 dw 80 ; pair 3: tap 0
- times 8 dw 48 ; tap 1
- times 8 dw 64 ; pair 4: tap 0
- times 8 dw 64 ; tap 1
- times 8 dw 48 ; pair 5: tap 0
- times 8 dw 80 ; tap 1
- times 8 dw 32 ; pair 6: tap 0
- times 8 dw 96 ; tap 1
- times 8 dw 16 ; pair 7: tap 0
- times 8 dw 112 ; tap 1