PageRenderTime 44ms CodeModel.GetById 16ms app.highlight 22ms RepoModel.GetById 1ms app.codeStats 1ms

/media/libjpeg/simd/jimmxred.asm

http://github.com/zpao/v8monkey
Assembly | 706 lines | 482 code | 132 blank | 92 comment | 1 complexity | 68891dfcfa1d936e39e0b047945dd602 MD5 | raw file
  1;
  2; jimmxred.asm - reduced-size IDCT (MMX)
  3;
  4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5;
  6; Based on
  7; x86 SIMD extension for IJG JPEG library
  8; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9; For conditions of distribution and use, see copyright notice in jsimdext.inc
 10;
 11; This file should be assembled with NASM (Netwide Assembler),
 12; can *not* be assembled with Microsoft's MASM or any compatible
 13; assembler (including Borland's Turbo Assembler).
 14; NASM is available from http://nasm.sourceforge.net/ or
 15; http://sourceforge.net/project/showfiles.php?group_id=6208
 16;
 17; This file contains inverse-DCT routines that produce reduced-size
 18; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
 19; The following code is based directly on the IJG's original jidctred.c;
 20; see the jidctred.c for more details.
 21;
 22; [TAB8]
 23
 24%include "jsimdext.inc"
 25%include "jdct.inc"
 26
 27; --------------------------------------------------------------------------
 28
 29%define CONST_BITS	13		; fraction bits carried by the F_* multipliers below
 30%define PASS1_BITS	2		; extra scale bits kept in pass-1 results
 31
; Right-shift counts used to descale the 32-bit sums (each is paired with a
; PD_DESCALE_* rounding constant).  P1 = end of pass 1, P2 = end of pass 2;
; the _4 / _2 suffix selects the 4x4 or the 2x2 routine.
 32%define DESCALE_P1_4	(CONST_BITS-PASS1_BITS+1)
 33%define DESCALE_P2_4	(CONST_BITS+PASS1_BITS+3+1)
 34%define DESCALE_P1_2	(CONST_BITS-PASS1_BITS+2)
 35%define DESCALE_P2_2	(CONST_BITS+PASS1_BITS+3+2)
 36
; Fixed-point multipliers: F_a_bcd holds FIX(a.bcd...), i.e. the real value
; scaled by 2^CONST_BITS and rounded to an integer.
 37%if CONST_BITS == 13
; Precomputed values for CONST_BITS == 13 (value * 8192, rounded).
 38F_0_211	equ	 1730		; FIX(0.211164243)
 39F_0_509	equ	 4176		; FIX(0.509795579)
 40F_0_601	equ	 4926		; FIX(0.601344887)
 41F_0_720	equ	 5906		; FIX(0.720959822)
 42F_0_765	equ	 6270		; FIX(0.765366865)
 43F_0_850	equ	 6967		; FIX(0.850430095)
 44F_0_899	equ	 7373		; FIX(0.899976223)
 45F_1_061	equ	 8697		; FIX(1.061594337)
 46F_1_272	equ	10426		; FIX(1.272758580)
 47F_1_451	equ	11893		; FIX(1.451774981)
 48F_1_847	equ	15137		; FIX(1.847759065)
 49F_2_172	equ	17799		; FIX(2.172734803)
 50F_2_562	equ	20995		; FIX(2.562915447)
 51F_3_624	equ	29692		; FIX(3.624509785)
 52%else
 53; NASM cannot do compile-time arithmetic on floating-point constants.
; Each constant is therefore written pre-scaled by 2^30 as an integer, and
; DESCALE(x,n) performs a rounding right shift by n = 30-CONST_BITS to leave
; CONST_BITS fraction bits.
 54%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
 55F_0_211	equ	DESCALE( 226735879,30-CONST_BITS)	; FIX(0.211164243)
 56F_0_509	equ	DESCALE( 547388834,30-CONST_BITS)	; FIX(0.509795579)
 57F_0_601	equ	DESCALE( 645689155,30-CONST_BITS)	; FIX(0.601344887)
 58F_0_720	equ	DESCALE( 774124714,30-CONST_BITS)	; FIX(0.720959822)
 59F_0_765	equ	DESCALE( 821806413,30-CONST_BITS)	; FIX(0.765366865)
 60F_0_850	equ	DESCALE( 913142361,30-CONST_BITS)	; FIX(0.850430095)
 61F_0_899	equ	DESCALE( 966342111,30-CONST_BITS)	; FIX(0.899976223)
 62F_1_061	equ	DESCALE(1139878239,30-CONST_BITS)	; FIX(1.061594337)
 63F_1_272	equ	DESCALE(1366614119,30-CONST_BITS)	; FIX(1.272758580)
 64F_1_451	equ	DESCALE(1558831516,30-CONST_BITS)	; FIX(1.451774981)
 65F_1_847	equ	DESCALE(1984016188,30-CONST_BITS)	; FIX(1.847759065)
 66F_2_172	equ	DESCALE(2332956230,30-CONST_BITS)	; FIX(2.172734803)
 67F_2_562	equ	DESCALE(2751909506,30-CONST_BITS)	; FIX(2.562915447)
 68F_3_624	equ	DESCALE(3891787747,30-CONST_BITS)	; FIX(3.624509785)
 69%endif
 70
 71; --------------------------------------------------------------------------
 72	SECTION	SEG_CONST
 73
; Read-only operand table for both routines in this file.  Every entry is
; one 64-bit MMX memory operand, addressed in the code through
; GOTOFF(ebx,<label>).
; NOTE(review): alignz is a macro defined outside this file (presumably
; zero-padded alignment) -- confirm in jsimdext.inc.
 74	alignz	16
 75	global	EXTN(jconst_idct_red_mmx)
 76
 77EXTN(jconst_idct_red_mmx):
 78
; PW_* entries: the word pair (c0,c1) repeated twice.  Used with pmaddwd on
; interleaved inputs (x,y,x',y'), giving the dwords x*c0+y*c1 and
; x'*c0+y'*c1.
 79PW_F184_MF076	times 2 dw  F_1_847,-F_0_765	; 4x4 even part: (in2,in6) pairs
 80PW_F256_F089	times 2 dw  F_2_562, F_0_899	; 4x4 odd part, tmp2: (in1,in3) pairs
 81PW_F106_MF217	times 2 dw  F_1_061,-F_2_172	; 4x4 odd part, tmp0: (in1,in3) pairs
 82PW_MF060_MF050	times 2 dw -F_0_601,-F_0_509	; 4x4 odd part, tmp2: (in5,in7) pairs
 83PW_F145_MF021	times 2 dw  F_1_451,-F_0_211	; 4x4 odd part, tmp0: (in5,in7) pairs
 84PW_F362_MF127	times 2 dw  F_3_624,-F_1_272	; 2x2 odd part: (in1,in3) pairs
 85PW_F085_MF072	times 2 dw  F_0_850,-F_0_720	; 2x2 odd part: (in5,in7) pairs
; PD_DESCALE_* entries: rounding bias 1 << (n-1), added with paddd just
; before the matching "psrad reg,DESCALE_*" (n = that shift count).
 86PD_DESCALE_P1_4	times 2 dd  1 << (DESCALE_P1_4-1)
 87PD_DESCALE_P2_4	times 2 dd  1 << (DESCALE_P2_4-1)
 88PD_DESCALE_P1_2	times 2 dd  1 << (DESCALE_P1_2-1)
 89PD_DESCALE_P2_2	times 2 dd  1 << (DESCALE_P2_2-1)
; Eight copies of CENTERJSAMPLE (defined outside this file), added with
; paddb to the packed signed bytes at the very end of each routine.
 90PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE
 91
 92	alignz	16
 93
 94; --------------------------------------------------------------------------
 95	SECTION	SEG_TEXT
 96	BITS	32
 97;
 98; Perform dequantization and inverse DCT on one block of coefficients,
 99; producing a reduced-size 4x4 output block.
100;
101; GLOBAL(void)
102; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
103;                     JSAMPARRAY output_buf, JDIMENSION output_col)
104;
; 32-bit code; the four arguments are read from the stack at +8 .. +20
; relative to the frame base established by "push ebp / mov eax,esp".
;
; Pass 1 dequantizes and transforms four input columns per loop iteration
; (two iterations) and stores the results transposed into a stack
; workspace.  Input rows 0,1,2,3,5,6,7 are read; row 4 is never read.
; Pass 2 applies the same butterfly to workspace rows 0,1,2,3,5,6,7 and
; stores 4 bytes into each of output_buf[0..3] at offset
; output_col*SIZEOF_JSAMPLE.
;
; Registers: esi and edi are pushed/popped; ebx is handled by the
; pushpic/poppic macros (defined outside this file); eax, ecx, edx and
; mm0-mm7 are overwritten.  emms is executed before returning.
;
105
; Argument offsets from frame base b (b+0 = saved ebp, as pushed at entry).
106%define dct_table(b)	(b)+8			; void * dct_table
107%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
108%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
109%define output_col(b)	(b)+20		; JDIMENSION output_col
110
; Locals, all addressed from the 8-byte-aligned ebp set up in the prologue:
;   [ebp+0]              frame base ("original ebp") stored by the prologue
;   wk(0), wk(1)         two mmword scratch slots directly below ebp
;   workspace            DCTSIZE2 JCOEFs directly below wk(0)
111%define original_ebp	ebp+0
112%define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
113%define WK_NUM		2
114%define workspace	wk(0)-DCTSIZE2*SIZEOF_JCOEF
115					; JCOEF workspace[DCTSIZE2]
116
117	align	16
118	global	EXTN(jsimd_idct_4x4_mmx)
119
120EXTN(jsimd_idct_4x4_mmx):
121	push	ebp
122	mov	eax,esp				; eax = original ebp (frame base: points at the saved ebp)
123	sub	esp, byte 4			; room to store the frame base
124	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
125	mov	[esp],eax			; [original_ebp] = frame base
126	mov	ebp,esp				; ebp = aligned ebp
127	lea	esp, [workspace]		; reserve wk[] and workspace below aligned ebp
128	pushpic	ebx
129;	push	ecx		; need not be preserved
130;	push	edx		; need not be preserved
131	push	esi
132	push	edi
133
134	get_GOT	ebx		; get GOT address
135
136	; ---- Pass 1: process columns from input, store into work array.
137
	; eax still holds the frame base loaded in the prologue, so the reload
	; below stays commented out.
	; NOTE(review): this relies on pushpic/get_GOT (macros defined outside
	; this file) leaving eax untouched -- confirm.
138;	mov	eax, [original_ebp]
139	mov	edx, POINTER [dct_table(eax)]	; quantptr
140	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
141	lea	edi, [workspace]			; JCOEF * wsptr
142	mov	ecx, DCTSIZE/4				; ctr (four columns per iteration)
143	alignx	16,7
144.columnloop:
145%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
	; Shortcut: if every AC row used by this transform (1,2,3,5,6,7) is zero
	; in these four columns, the output is just the scaled DC term.
	; First a cheap scalar check on one dword each of rows 1 and 2 ...
146	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
147	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
148	jnz	short .columnDCT
149
	; ... then OR together all four words of rows 1,2,3,5,6,7.
150	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
151	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
152	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
153	por	mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
154	por	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
155	por	mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
156	por	mm0,mm1
157	packsswb mm0,mm0		; saturating pack: a nonzero word stays a nonzero byte
158	movd	eax,mm0			; eax = the four packed bytes
159	test	eax,eax
160	jnz	short .columnDCT
161
162	; -- AC terms all zero
163
164	movq	mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
165	pmullw	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]	; dequantize row 0
166
167	psllw	mm0,PASS1_BITS		; scale DC terms to pass-1 precision
168
	; Replicate each column's DC term four times: one workspace row per column.
169	movq      mm2,mm0		; mm0=in0=(00 01 02 03)
170	punpcklwd mm0,mm0		; mm0=(00 00 01 01)
171	punpckhwd mm2,mm2		; mm2=(02 02 03 03)
172
173	movq      mm1,mm0
174	punpckldq mm0,mm0		; mm0=(00 00 00 00)
175	punpckhdq mm1,mm1		; mm1=(01 01 01 01)
176	movq      mm3,mm2
177	punpckldq mm2,mm2		; mm2=(02 02 02 02)
178	punpckhdq mm3,mm3		; mm3=(03 03 03 03)
179
180	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
181	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
182	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
183	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
184	jmp	near .nextcolumn
185	alignx	16,7
186%endif
187.columnDCT:
188
189	; -- Odd part
190
	; Load and dequantize rows 1,3,5,7 (four columns each).
191	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
192	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
193	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
194	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
195	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
196	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
197	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
198	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
199
	; Interleave (in1,in3) word pairs so pmaddwd yields in1*c0+in3*c1 per column.
	; L = columns 0-1, H = columns 2-3 of the current group.
200	movq      mm4,mm0
201	movq      mm5,mm0
202	punpcklwd mm4,mm1
203	punpckhwd mm5,mm1
204	movq      mm0,mm4
205	movq      mm1,mm5
206	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
207	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
208	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
209	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
210
	; Same for the (in5,in7) pairs; these are partial sums until the paddd below.
211	movq      mm6,mm2
212	movq      mm7,mm2
213	punpcklwd mm6,mm3
214	punpckhwd mm7,mm3
215	movq      mm2,mm6
216	movq      mm3,mm7
217	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
218	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
219	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
220	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
221
222	paddd	mm6,mm4			; mm6=tmp2L
223	paddd	mm7,mm5			; mm7=tmp2H
224	paddd	mm2,mm0			; mm2=tmp0L
225	paddd	mm3,mm1			; mm3=tmp0H
226
	; Spill tmp0: all eight MMX registers are needed for the even part.
227	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
228	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
229
230	; -- Even part
231
	; Load and dequantize rows 0,2,6.
232	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
233	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
234	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
235	pmullw	mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
236	pmullw	mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
237	pmullw	mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
238
	; Place each in0 word in the high half of a dword, then shift right
	; arithmetically: the net effect is sign-extended in0 << (CONST_BITS+1).
239	pxor      mm1,mm1
240	pxor      mm2,mm2
241	punpcklwd mm1,mm4		; mm1=tmp0L
242	punpckhwd mm2,mm4		; mm2=tmp0H
243	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
244	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
245
246	movq      mm3,mm5		; mm5=in2=z2
247	punpcklwd mm5,mm0		; mm0=in6=z3
248	punpckhwd mm3,mm0
249	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
250	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
251
252	movq	mm4,mm1
253	movq	mm0,mm2
254	paddd	mm1,mm5			; mm1=tmp10L
255	paddd	mm2,mm3			; mm2=tmp10H
256	psubd	mm4,mm5			; mm4=tmp12L
257	psubd	mm0,mm3			; mm0=tmp12H
258
259	; -- Final output stage
260
	; data0 = tmp10 + tmp2 (odd), data3 = tmp10 - tmp2 (odd)
261	movq	mm5,mm1
262	movq	mm3,mm2
263	paddd	mm1,mm6			; mm1=data0L
264	paddd	mm2,mm7			; mm2=data0H
265	psubd	mm5,mm6			; mm5=data3L
266	psubd	mm3,mm7			; mm3=data3H
267
268	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm6=[PD_DESCALE_P1_4]
269
	; Add the rounding bias, then shift right by DESCALE_P1_4.
270	paddd	mm1,mm6
271	paddd	mm2,mm6
272	psrad	mm1,DESCALE_P1_4
273	psrad	mm2,DESCALE_P1_4
274	paddd	mm5,mm6
275	paddd	mm3,mm6
276	psrad	mm5,DESCALE_P1_4
277	psrad	mm3,DESCALE_P1_4
278
279	packssdw  mm1,mm2		; mm1=data0=(00 01 02 03)
280	packssdw  mm5,mm3		; mm5=data3=(30 31 32 33)
281
282	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
283	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
284
	; data1 = tmp12 + tmp0 (odd), data2 = tmp12 - tmp0 (odd)
285	movq	mm2,mm4
286	movq	mm3,mm0
287	paddd	mm4,mm7			; mm4=data1L
288	paddd	mm0,mm6			; mm0=data1H
289	psubd	mm2,mm7			; mm2=data2L
290	psubd	mm3,mm6			; mm3=data2H
291
292	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]	; mm7=[PD_DESCALE_P1_4]
293
294	paddd	mm4,mm7
295	paddd	mm0,mm7
296	psrad	mm4,DESCALE_P1_4
297	psrad	mm0,DESCALE_P1_4
298	paddd	mm2,mm7
299	paddd	mm3,mm7
300	psrad	mm2,DESCALE_P1_4
301	psrad	mm3,DESCALE_P1_4
302
303	packssdw  mm4,mm0		; mm4=data1=(10 11 12 13)
304	packssdw  mm2,mm3		; mm2=data2=(20 21 22 23)
305
	; 4x4 word transpose so that each workspace row holds one input column's
	; four results.
306	movq      mm6,mm1		; transpose coefficients(phase 1)
307	punpcklwd mm1,mm4		; mm1=(00 10 01 11)
308	punpckhwd mm6,mm4		; mm6=(02 12 03 13)
309	movq      mm7,mm2		; transpose coefficients(phase 1)
310	punpcklwd mm2,mm5		; mm2=(20 30 21 31)
311	punpckhwd mm7,mm5		; mm7=(22 32 23 33)
312
313	movq      mm0,mm1		; transpose coefficients(phase 2)
314	punpckldq mm1,mm2		; mm1=(00 10 20 30)
315	punpckhdq mm0,mm2		; mm0=(01 11 21 31)
316	movq      mm3,mm6		; transpose coefficients(phase 2)
317	punpckldq mm6,mm7		; mm6=(02 12 22 32)
318	punpckhdq mm3,mm7		; mm3=(03 13 23 33)
319
320	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
321	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
322	movq	MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
323	movq	MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
324
325.nextcolumn:
	; Advance input and quantization pointers by four columns and the
	; workspace pointer by four rows.
326	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
327	add	edx, byte 4*SIZEOF_ISLOW_MULT_TYPE	; quantptr
328	add	edi, byte 4*DCTSIZE*SIZEOF_JCOEF	; wsptr
329	dec	ecx					; ctr
330	jnz	near .columnloop
331
332	; ---- Pass 2: process rows from work array, store into output array.
333
	; eax was overwritten by the zero-column test, so reload the frame base.
334	mov	eax, [original_ebp]
335	lea	esi, [workspace]			; JCOEF * wsptr
336	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
337	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
338
339	; -- Odd part
340
	; Workspace rows 1,3,5,7 (already dequantized in pass 1; no pmullw here).
341	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
342	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
343	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
344	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
345
346	movq      mm4,mm0
347	movq      mm5,mm0
348	punpcklwd mm4,mm1
349	punpckhwd mm5,mm1
350	movq      mm0,mm4
351	movq      mm1,mm5
352	pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]	; mm4=(tmp2L)
353	pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]	; mm5=(tmp2H)
354	pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]	; mm0=(tmp0L)
355	pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]	; mm1=(tmp0H)
356
357	movq      mm6,mm2
358	movq      mm7,mm2
359	punpcklwd mm6,mm3
360	punpckhwd mm7,mm3
361	movq      mm2,mm6
362	movq      mm3,mm7
363	pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]	; mm6=(tmp2L)
364	pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]	; mm7=(tmp2H)
365	pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]	; mm2=(tmp0L)
366	pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]	; mm3=(tmp0H)
367
368	paddd	mm6,mm4			; mm6=tmp2L
369	paddd	mm7,mm5			; mm7=tmp2H
370	paddd	mm2,mm0			; mm2=tmp0L
371	paddd	mm3,mm1			; mm3=tmp0H
372
373	movq	MMWORD [wk(0)], mm2	; wk(0)=tmp0L
374	movq	MMWORD [wk(1)], mm3	; wk(1)=tmp0H
375
376	; -- Even part
377
	; Workspace rows 0,2,6.
378	movq	mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
379	movq	mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
380	movq	mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
381
382	pxor      mm1,mm1
383	pxor      mm2,mm2
384	punpcklwd mm1,mm4		; mm1=tmp0L
385	punpckhwd mm2,mm4		; mm2=tmp0H
386	psrad     mm1,(16-CONST_BITS-1)	; psrad mm1,16 & pslld mm1,CONST_BITS+1
387	psrad     mm2,(16-CONST_BITS-1)	; psrad mm2,16 & pslld mm2,CONST_BITS+1
388
389	movq      mm3,mm5		; mm5=in2=z2
390	punpcklwd mm5,mm0		; mm0=in6=z3
391	punpckhwd mm3,mm0
392	pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]	; mm5=tmp2L
393	pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]	; mm3=tmp2H
394
395	movq	mm4,mm1
396	movq	mm0,mm2
397	paddd	mm1,mm5			; mm1=tmp10L
398	paddd	mm2,mm3			; mm2=tmp10H
399	psubd	mm4,mm5			; mm4=tmp12L
400	psubd	mm0,mm3			; mm0=tmp12H
401
402	; -- Final output stage
403
404	movq	mm5,mm1
405	movq	mm3,mm2
406	paddd	mm1,mm6			; mm1=data0L
407	paddd	mm2,mm7			; mm2=data0H
408	psubd	mm5,mm6			; mm5=data3L
409	psubd	mm3,mm7			; mm3=data3H
410
411	movq	mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm6=[PD_DESCALE_P2_4]
412
	; Add the rounding bias, then shift right by DESCALE_P2_4.
413	paddd	mm1,mm6
414	paddd	mm2,mm6
415	psrad	mm1,DESCALE_P2_4
416	psrad	mm2,DESCALE_P2_4
417	paddd	mm5,mm6
418	paddd	mm3,mm6
419	psrad	mm5,DESCALE_P2_4
420	psrad	mm3,DESCALE_P2_4
421
422	packssdw  mm1,mm2		; mm1=data0=(00 10 20 30)
423	packssdw  mm5,mm3		; mm5=data3=(03 13 23 33)
424
425	movq	mm7, MMWORD [wk(0)]	; mm7=tmp0L
426	movq	mm6, MMWORD [wk(1)]	; mm6=tmp0H
427
428	movq	mm2,mm4
429	movq	mm3,mm0
430	paddd	mm4,mm7			; mm4=data1L
431	paddd	mm0,mm6			; mm0=data1H
432	psubd	mm2,mm7			; mm2=data2L
433	psubd	mm3,mm6			; mm3=data2H
434
435	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]	; mm7=[PD_DESCALE_P2_4]
436
437	paddd	mm4,mm7
438	paddd	mm0,mm7
439	psrad	mm4,DESCALE_P2_4
440	psrad	mm0,DESCALE_P2_4
441	paddd	mm2,mm7
442	paddd	mm3,mm7
443	psrad	mm2,DESCALE_P2_4
444	psrad	mm3,DESCALE_P2_4
445
446	packssdw  mm4,mm0		; mm4=data1=(01 11 21 31)
447	packssdw  mm2,mm3		; mm2=data2=(02 12 22 32)
448
449	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
450
	; Saturate to signed bytes, then add the centering constant to every byte.
451	packsswb  mm1,mm2		; mm1=(00 10 20 30 02 12 22 32)
452	packsswb  mm4,mm5		; mm4=(01 11 21 31 03 13 23 33)
453	paddb     mm1,mm6
454	paddb     mm4,mm6
455
	; Byte transpose back to row order: two output rows per register.
456	movq      mm7,mm1		; transpose coefficients(phase 1)
457	punpcklbw mm1,mm4		; mm1=(00 01 10 11 20 21 30 31)
458	punpckhbw mm7,mm4		; mm7=(02 03 12 13 22 23 32 33)
459
460	movq      mm0,mm1		; transpose coefficients(phase 2)
461	punpcklwd mm1,mm7		; mm1=(00 01 02 03 10 11 12 13)
462	punpckhwd mm0,mm7		; mm0=(20 21 22 23 30 31 32 33)
463
	; Low dwords hold rows 0 and 2: store them first.
464	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
465	mov	esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
466	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
467	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
468
	; Shift the high dwords (rows 1 and 3) down and store those.
469	psrlq	mm1,4*BYTE_BIT
470	psrlq	mm0,4*BYTE_BIT
471
472	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
473	mov	esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
474	movd	DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
475	movd	DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
476
477	emms		; empty MMX state
478
479	pop	edi
480	pop	esi
481;	pop	edx		; need not be preserved
482;	pop	ecx		; need not be preserved
483	poppic	ebx
484	mov	esp,ebp		; esp <- aligned ebp
485	pop	esp		; esp <- original ebp
486	pop	ebp
487	ret
488
489
490; --------------------------------------------------------------------------
491;
492; Perform dequantization and inverse DCT on one block of coefficients,
493; producing a reduced-size 2x2 output block.
494;
495; GLOBAL(void)
496; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
497;                     JSAMPARRAY output_buf, JDIMENSION output_col)
498;
; Uses a plain "push ebp / mov ebp,esp" frame with the arguments at
; ebp+8 .. ebp+20.  No workspace is used: the pass-1 results stay in MMX
; registers and feed pass 2 directly.  Only input rows 0,1,3,5,7 are
; loaded, and only columns 0,1,3,5,7 of those contribute (see the diagram
; in pass 1).  Two bytes are stored to each of output_buf[0] and
; output_buf[1] at offset output_col*SIZEOF_JSAMPLE.
;
; Registers: ebx, esi, edi are pushed/popped; eax, ecx, edx and mm0-mm7
; are overwritten.  ebx holds the GOT address until the final stores,
; where it is reused as scratch.  emms is executed before returning.
;
499
; Argument offsets from frame base b (same values as in the 4x4 routine).
500%define dct_table(b)	(b)+8			; void * dct_table
501%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
502%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
503%define output_col(b)	(b)+20		; JDIMENSION output_col
504
505	align	16
506	global	EXTN(jsimd_idct_2x2_mmx)
507
508EXTN(jsimd_idct_2x2_mmx):
509	push	ebp
510	mov	ebp,esp
511	push	ebx		; plain push (not pushpic): ebx is also used as scratch below
512;	push	ecx		; need not be preserved
513;	push	edx		; need not be preserved
514	push	esi
515	push	edi
516
517	get_GOT	ebx		; get GOT address
518
519	; ---- Pass 1: process columns from input.
520
521	mov	edx, POINTER [dct_table(ebp)]	; quantptr
522	mov	esi, JCOEFPTR [coef_block(ebp)]		; inptr
523
	; "**" marks coefficients that do not contribute to the result.
524	; | input:                  | result:        |
525	; | 00 01 ** 03 ** 05 ** 07 |                |
526	; | 10 11 ** 13 ** 15 ** 17 |                |
527	; | ** ** ** ** ** ** ** ** |                |
528	; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
529	; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
530	; | 50 51 ** 53 ** 55 ** 57 |                |
531	; | ** ** ** ** ** ** ** ** |                |
532	; | 70 71 ** 73 ** 75 ** 77 |                |
533
534	; -- Odd part
535
	; Load and dequantize columns 0-3 of rows 1,3,5,7.
536	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
537	movq	mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
538	pmullw	mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
539	pmullw	mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
540	movq	mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
541	movq	mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
542	pmullw	mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
543	pmullw	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
544
545	; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
546	; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
547
	; Build a mask selecting the odd-indexed word of each dword.
548	pcmpeqd   mm7,mm7		; mm7 = all ones
549	pslld     mm7,WORD_BIT		; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
550
	; Columns 0 and 1: interleave row pairs for pmaddwd.
551	movq      mm4,mm0		; mm4=(10 11 ** 13)
552	movq      mm5,mm2		; mm5=(50 51 ** 53)
553	punpcklwd mm4,mm1		; mm4=(10 30 11 31)
554	punpcklwd mm5,mm3		; mm5=(50 70 51 71)
555	pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
556	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
557
	; Columns 1 and 3: move the odd word down (psrld) in one row and keep
	; it in place (pand) in the other, then merge into (row,row) pairs.
558	psrld	mm0,WORD_BIT		; mm0=(11 -- 13 --)
559	pand	mm1,mm7			; mm1=(-- 31 -- 33)
560	psrld	mm2,WORD_BIT		; mm2=(51 -- 53 --)
561	pand	mm3,mm7			; mm3=(-- 71 -- 73)
562	por	mm0,mm1			; mm0=(11 31 13 33)
563	por	mm2,mm3			; mm2=(51 71 53 73)
564	pmaddwd	mm0,[GOTOFF(ebx,PW_F362_MF127)]
565	pmaddwd	mm2,[GOTOFF(ebx,PW_F085_MF072)]
566
	; Only the col0 half of mm4 is used later (punpckldq in the output stage).
567	paddd	mm4,mm5			; mm4=tmp0[col0 col1]
568
	; Load and dequantize columns 4-7 of rows 1,3,5,7.
569	movq	mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
570	movq	mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
571	pmullw	mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
572	pmullw	mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
573	movq	mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
574	movq	mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
575	pmullw	mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
576	pmullw	mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
577
578	; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
579	; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
580
581	psrld	mm6,WORD_BIT		; mm6=(15 -- 17 --)
582	pand	mm1,mm7			; mm1=(-- 35 -- 37)
583	psrld	mm3,WORD_BIT		; mm3=(55 -- 57 --)
584	pand	mm5,mm7			; mm5=(-- 75 -- 77)
585	por	mm6,mm1			; mm6=(15 35 17 37)
586	por	mm3,mm5			; mm3=(55 75 57 77)
587	pmaddwd	mm6,[GOTOFF(ebx,PW_F362_MF127)]
588	pmaddwd	mm3,[GOTOFF(ebx,PW_F085_MF072)]
589
590	paddd	mm0,mm2			; mm0=tmp0[col1 col3]
591	paddd	mm6,mm3			; mm6=tmp0[col5 col7]
592
593	; -- Even part
594
	; Load and dequantize row 0 (both halves).
595	movq	mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
596	movq	mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
597	pmullw	mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
598	pmullw	mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
599
600	; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
601
	; With the wanted word in the high half of each dword, the arithmetic
	; right shift by (WORD_BIT-CONST_BITS-2) leaves the sign-extended
	; value << (CONST_BITS+2).
602	movq	mm2,mm1				; mm2=(00 01 ** 03)
603	pslld	mm1,WORD_BIT			; mm1=(-- 00 -- **)
604	psrad	mm1,(WORD_BIT-CONST_BITS-2)	; mm1=tmp10[col0 ****]
605
606	pand	mm2,mm7				; mm2=(-- 01 -- 03)
607	pand	mm5,mm7				; mm5=(-- 05 -- 07)
608	psrad	mm2,(WORD_BIT-CONST_BITS-2)	; mm2=tmp10[col1 col3]
609	psrad	mm5,(WORD_BIT-CONST_BITS-2)	; mm5=tmp10[col5 col7]
610
611	; -- Final output stage
612
	; Column 0: A = tmp10 + tmp0, B = tmp10 - tmp0; keep only the low dwords.
613	movq      mm3,mm1
614	paddd     mm1,mm4		; mm1=data0[col0 ****]=(A0 **)
615	psubd     mm3,mm4		; mm3=data1[col0 ****]=(B0 **)
616	punpckldq mm1,mm3		; mm1=(A0 B0)
617
	; mm7 (the word mask) is no longer needed; reuse it for the rounding bias.
618	movq	mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]	; mm7=[PD_DESCALE_P1_2]
619
620	movq	mm4,mm2
621	movq	mm3,mm5
622	paddd	mm2,mm0			; mm2=data0[col1 col3]=(A1 A3)
623	paddd	mm5,mm6			; mm5=data0[col5 col7]=(A5 A7)
624	psubd	mm4,mm0			; mm4=data1[col1 col3]=(B1 B3)
625	psubd	mm3,mm6			; mm3=data1[col5 col7]=(B5 B7)
626
	; Add the rounding bias, then shift right by DESCALE_P1_2.
627	paddd	mm1,mm7
628	psrad	mm1,DESCALE_P1_2
629
630	paddd	mm2,mm7
631	paddd	mm5,mm7
632	psrad	mm2,DESCALE_P1_2
633	psrad	mm5,DESCALE_P1_2
634	paddd	mm4,mm7
635	paddd	mm3,mm7
636	psrad	mm4,DESCALE_P1_2
637	psrad	mm3,DESCALE_P1_2
638
639	; ---- Pass 2: process rows, store into output array.
640
641	mov	edi, JSAMPARRAY [output_buf(ebp)]	; (JSAMPROW *)
642	mov	eax, JDIMENSION [output_col(ebp)]
643
644	; | input:| result:|
645	; | A0 B0 |        |
646	; | A1 B1 | C0 C1  |
647	; | A3 B3 | D0 D1  |
648	; | A5 B5 |        |
649	; | A7 B7 |        |
650
651	; -- Odd part
652
	; Pack the pass-1 dwords back to words; the (1,3) and (5,7) pairs are
	; then already adjacent for pmaddwd, one output dword per row A / B.
653	packssdw  mm2,mm4		; mm2=(A1 A3 B1 B3)
654	packssdw  mm5,mm3		; mm5=(A5 A7 B5 B7)
655	pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
656	pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
657
658	paddd     mm2,mm5		; mm2=tmp0[row0 row1]
659
660	; -- Even part
661
	; mm1 = (A0 B0) as dwords from pass 1.
662	pslld     mm1,(CONST_BITS+2)	; mm1=tmp10[row0 row1]
663
664	; -- Final output stage
665
666	movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]	; mm0=[PD_DESCALE_P2_2]
667
668	movq      mm6,mm1
669	paddd     mm1,mm2		; mm1=data0[row0 row1]=(C0 C1)
670	psubd     mm6,mm2		; mm6=data1[row0 row1]=(D0 D1)
671
	; Add the rounding bias, then shift right by DESCALE_P2_2.
672	paddd     mm1,mm0
673	paddd     mm6,mm0
674	psrad     mm1,DESCALE_P2_2
675	psrad     mm6,DESCALE_P2_2
676
677	movq      mm7,mm1		; transpose coefficients
678	punpckldq mm1,mm6		; mm1=(C0 D0)
679	punpckhdq mm7,mm6		; mm7=(C1 D1)
680
	; Saturate dwords -> words -> signed bytes, then add the centering constant.
681	packssdw  mm1,mm7		; mm1=(C0 D0 C1 D1)
682	packsswb  mm1,mm1		; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
683	paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
684
	; Last GOT-relative access was above, so ebx can now be used as scratch.
685	movd	ecx,mm1
686	movd	ebx,mm1			; ebx=(C0 D0 C1 D1)
687	shr	ecx,2*BYTE_BIT		; ecx=(C1 D1 -- --)
688
	; bx = (C0 D0) goes to output row 0, cx = (C1 D1) to output row 1.
689	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
690	mov	esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
691	mov	WORD [edx+eax*SIZEOF_JSAMPLE], bx
692	mov	WORD [esi+eax*SIZEOF_JSAMPLE], cx
693
694	emms		; empty MMX state
695
696	pop	edi
697	pop	esi
698;	pop	edx		; need not be preserved
699;	pop	ecx		; need not be preserved
700	pop	ebx
701	pop	ebp
702	ret
703
704; For some reason, the OS X linker does not honor the request to align the
705; segment unless we do this.
; (This pads the end of the code emitted above to a 16-byte boundary.)
706	align	16