PageRenderTime 51ms CodeModel.GetById 14ms app.highlight 30ms RepoModel.GetById 1ms app.codeStats 1ms

/media/libjpeg/simd/jcsammmx.asm

http://github.com/zpao/v8monkey
Assembly | 324 lines | 204 code | 63 blank | 57 comment | 0 complexity | 68c4533083e89c591bb5ca37917140cd MD5 | raw file
  1;
  2; jcsammmx.asm - downsampling (MMX)
  3;
  4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5;
  6; Based on
  7; x86 SIMD extension for IJG JPEG library
  8; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9; For conditions of distribution and use, see copyright notice in jsimdext.inc
 10;
 11; This file should be assembled with NASM (Netwide Assembler); it
 12; can *not* be assembled with Microsoft's MASM or any compatible
 13; assembler (including Borland's Turbo Assembler).
 14; NASM is available from http://nasm.sourceforge.net/ or
 15; http://sourceforge.net/project/showfiles.php?group_id=6208
 16;
 17; [TAB8]
 18
 19%include "jsimdext.inc"
 20
 21; --------------------------------------------------------------------------
 22	SECTION	SEG_TEXT
 23	BITS	32
 ;
 ; jsimd_h2v1_downsample_mmx
 ;
 ; Downsamples one component by 2:1 horizontally and 1:1 vertically, with no
 ; smoothing.
 ;
 ; C prototype:
 ;   GLOBAL(void)
 ;   jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
 ;                              JDIMENSION v_samp_factor, JDIMENSION width_blocks,
 ;                              JSAMPARRAY input_data, JSAMPARRAY output_data);
 ;
 ; Arguments are read from the stack relative to ebp (offsets defined below).
 ;   Saved and restored : ebp, esi, edi
 ;   Overwritten        : eax, ecx, edx, flags, mm0-mm3, mm6, mm7
 ;   cld is executed (DF cleared) whenever the padding step runs.
 ;   emms is executed only when at least one row was downsampled.
 ;
 ; Step 1 - right-edge padding.  With output_cols = width_blocks * 8, each of
 ;   the first max_v_samp_factor rows of input_data is extended past column
 ;   image_width by (output_cols*2 - image_width) copies of its last sample,
 ;   provided that count is positive.  The routine does not check that the
 ;   rows have room for these bytes.
 ;
 ; Step 2 - downsampling.  For each of v_samp_factor rows, every pair of
 ;   neighbouring input samples (a, b) yields one output sample
 ;   (a + b + bias) >> 1, with bias alternating 0, 1, 0, 1 ... along the
 ;   output row.  One quadword of output (two quadwords of input) is handled
 ;   per inner-loop pass.
 ;
 ; The byte-lane comments below assume BYTE_BIT (from jsimdext.inc) is 8.
 ;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi                     ; esi/edi are restored at .done;
        push    edi                     ; ebx is never touched

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; nothing to produce

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- Step 1: replicate the last sample of each input row ----------

        push    ecx                     ; keep output_cols for step 2
        shl     ecx, 1
        sub     ecx, edx                ; ecx = output_cols*2 - image_width
        jle     short .pad_done         ; no padding needed

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> current row pointer
        cld                             ; stosb must move forward
        alignx  16,7
.pad_row:
        push    eax                     ; al is about to hold the fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first column past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- Step 2: average horizontal sample pairs -----------------------

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to process
        test    eax, eax
        jle     near .done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row pointers

        pcmpeqw   mm6, mm6              ; all bits set
        psrlw     mm6, BYTE_BIT         ; mm6 = low-byte mask in every word
        mov       edx, 0x00010000       ; words (low, high) = (0, 1)
        movd      mm7, edx
        punpckldq mm7, mm7              ; mm7 = bias words {0, 1, 0, 1}
        alignx  16,7
.next_row:
        push    ecx                     ; inner loop counts ecx down to zero
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; esi = input sample pointer
        mov     edi, JSAMPROW [edi]     ; edi = output sample pointer
        alignx  16,7
.next_group:
        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]       ; input quadword 0
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]       ; input quadword 1
        movq    mm2, mm0
        movq    mm3, mm1

        pand    mm0, mm6                ; even-indexed samples, as words
        psrlw   mm2, BYTE_BIT           ; odd-indexed samples, as words
        pand    mm1, mm6
        psrlw   mm3, BYTE_BIT

        paddw   mm0, mm2                ; a + b for each pair
        paddw   mm1, mm3
        paddw   mm0, mm7                ; + alternating bias
        paddw   mm1, mm7
        psrlw   mm0, 1                  ; halve
        psrlw   mm1, 1

        packuswb mm0, mm1               ; narrow both word vectors into one quadword

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     esi, byte 2*SIZEOF_MMWORD       ; two input quadwords consumed
        add     edi, byte 1*SIZEOF_MMWORD       ; one output quadword written
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     short .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        dec     eax
        jg      short .next_row

        emms                            ; leave MMX state empty

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
164
165; --------------------------------------------------------------------------
;
; jsimd_h2v2_downsample_mmx
;
; Downsamples one component by 2:1 horizontally and 2:1 vertically, with no
; smoothing.
;
; C prototype:
;   GLOBAL(void)
;   jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                              JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                              JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (offsets defined below).
;   Saved and restored : ebp, esi, edi
;   Overwritten        : eax, ecx, edx, flags, mm0-mm7
;   cld is executed (DF cleared) whenever the padding step runs.
;   emms is executed only when at least one output row was produced.
;
; Step 1 - right-edge padding.  With output_cols = width_blocks * 8, each of
;   the first max_v_samp_factor rows of input_data is extended past column
;   image_width by (output_cols*2 - image_width) copies of its last sample,
;   provided that count is positive.  The routine does not check that the
;   rows have room for these bytes.
;
; Step 2 - downsampling.  Each of the v_samp_factor output rows is built from
;   two consecutive input rows.  Every 2x2 group of input samples yields one
;   output sample (sum of the four + bias) >> 2, with bias alternating
;   1, 2, 1, 2 ... along the output row.  One quadword of output (two
;   quadwords from each input row) is handled per inner-loop pass.
;
; The byte-lane comments below assume BYTE_BIT (from jsimdext.inc) is 8.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi                     ; esi/edi are restored at .done;
        push    edi                     ; ebx is never touched

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; nothing to produce

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- Step 1: replicate the last sample of each input row ----------

        push    ecx                     ; keep output_cols for step 2
        shl     ecx, 1
        sub     ecx, edx                ; ecx = output_cols*2 - image_width
        jle     short .pad_done         ; no padding needed

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> current row pointer
        cld                             ; stosb must move forward
        alignx  16,7
.pad_row:
        push    eax                     ; al is about to hold the fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first column past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- Step 2: average 2x2 sample groups ------------------------------

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
        test    eax, eax
        jle     near .done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row pointers

        pcmpeqw   mm6, mm6              ; all bits set
        psrlw     mm6, BYTE_BIT         ; mm6 = low-byte mask in every word
        mov       edx, 0x00020001       ; words (low, high) = (1, 2)
        movd      mm7, edx
        punpckldq mm7, mm7              ; mm7 = bias words {1, 2, 1, 2}
        alignx  16,7
.next_row:
        push    ecx                     ; inner loop counts ecx down to zero
        push    edi
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
        mov     edi, JSAMPROW [edi]                     ; edi = output sample pointer
        alignx  16,7
.next_group:
        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]       ; upper row, quadword 0
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]       ; lower row, quadword 0
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]       ; upper row, quadword 1
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]       ; lower row, quadword 1

        ; horizontal pair sums for quadword 0 of both rows
        movq    mm4, mm0
        movq    mm5, mm1
        pand    mm0, mm6                ; even-indexed samples, as words
        psrlw   mm4, BYTE_BIT           ; odd-indexed samples, as words
        pand    mm1, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm0, mm4
        paddw   mm1, mm5

        ; horizontal pair sums for quadword 1 of both rows
        movq    mm4, mm2
        movq    mm5, mm3
        pand    mm2, mm6
        psrlw   mm4, BYTE_BIT
        pand    mm3, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm2, mm4
        paddw   mm3, mm5

        paddw   mm0, mm1                ; add the two rows: 2x2 sums
        paddw   mm2, mm3
        paddw   mm0, mm7                ; + alternating bias
        paddw   mm2, mm7
        psrlw   mm0, 2                  ; divide by four
        psrlw   mm2, 2

        packuswb mm0, mm2               ; narrow both word vectors into one quadword

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD       ; upper row: two quadwords consumed
        add     esi, byte 2*SIZEOF_MMWORD       ; lower row: two quadwords consumed
        add     edi, byte 1*SIZEOF_MMWORD       ; one output quadword written
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     near .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two input rows just used
        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        dec     eax
        jg      near .next_row

        emms                            ; leave MMX state empty

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
321
322; For some reason, the OS X linker does not honor the request to align the
323; segment unless we do this.
324	align	16