/media/libjpeg/simd/jiss2int.asm

http://github.com/zpao/v8monkey · Assembly · 859 lines · 599 code · 147 blank · 113 comment · 1 complexity · f49937901bc900d30ad238bae805f86d MD5 · raw file

  1. ;
  2. ; jiss2int.asm - accurate integer IDCT (SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on
  7. ; x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a slow-but-accurate integer implementation of the
  18. ; inverse DCT (Discrete Cosine Transform). The following code is based
  19. ; directly on the IJG's original jidctint.c; see the jidctint.c for
  20. ; more details.
  21. ;
  22. ; [TAB8]
  23. %include "jsimdext.inc"
  24. %include "jdct.inc"
  25. ; --------------------------------------------------------------------------
  26. %define CONST_BITS 13 ; number of fractional bits in the F_* fixed-point constants
  27. %define PASS1_BITS 2 ; extra left-scaling carried by pass-1 results (shifted out again by DESCALE_P2)
  28. %define DESCALE_P1 (CONST_BITS-PASS1_BITS) ; right-shift count used with psrad at the end of pass 1
  29. %define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) ; right-shift count used with psrad at the end of pass 2
  30. %if CONST_BITS == 13
  31. F_0_298 equ 2446 ; FIX(0.298631336)
  32. F_0_390 equ 3196 ; FIX(0.390180644)
  33. F_0_541 equ 4433 ; FIX(0.541196100)
  34. F_0_765 equ 6270 ; FIX(0.765366865)
  35. F_0_899 equ 7373 ; FIX(0.899976223)
  36. F_1_175 equ 9633 ; FIX(1.175875602)
  37. F_1_501 equ 12299 ; FIX(1.501321110)
  38. F_1_847 equ 15137 ; FIX(1.847759065)
  39. F_1_961 equ 16069 ; FIX(1.961570560)
  40. F_2_053 equ 16819 ; FIX(2.053119869)
  41. F_2_562 equ 20995 ; FIX(2.562915447)
  42. F_3_072 equ 25172 ; FIX(3.072711026)
  43. %else
  44. ; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is written pre-scaled by 2^30 and reduced to CONST_BITS fractional bits with DESCALE.
  45. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) ; right shift by n with round-to-nearest (adds half of 2^n first)
  46. F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336)
  47. F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644)
  48. F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100)
  49. F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
  50. F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
  51. F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602)
  52. F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110)
  53. F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
  54. F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560)
  55. F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869)
  56. F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
  57. F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026)
  58. %endif
  59. ; --------------------------------------------------------------------------
  60. SECTION SEG_CONST
  61. alignz 16
  62. global EXTN(jconst_idct_islow_sse2) ; exported label marking the start of this constant table
  63. EXTN(jconst_idct_islow_sse2):
  64. PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 ; pmaddwd weights on (z2,z3) word pairs -> tmp3 (even part)
  65. PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) ; pmaddwd weights on (z2,z3) word pairs -> tmp2 (even part)
  66. PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 ; pmaddwd weights on (z3,z4) word pairs -> z3 (odd part)
  67. PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) ; pmaddwd weights on (z3,z4) word pairs -> z4 (odd part)
  68. PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 ; pmaddwd weights on (tmp0,tmp3) word pairs -> tmp0 (odd part)
  69. PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) ; pmaddwd weights on (tmp0,tmp3) word pairs -> tmp3 (odd part)
  70. PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 ; pmaddwd weights on (tmp1,tmp2) word pairs -> tmp1 (odd part)
  71. PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) ; pmaddwd weights on (tmp1,tmp2) word pairs -> tmp2 (odd part)
  72. PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) ; rounding bias added before each "psrad x,DESCALE_P1" in pass 1
  73. PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) ; rounding bias added before each "psrad x,DESCALE_P2" in pass 2
  74. PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; CENTERJSAMPLE in every byte; added with paddb to the packed pass-2 output
  75. alignz 16
  76. ; --------------------------------------------------------------------------
  77. SECTION SEG_TEXT
  78. BITS 32
  79. ;
  80. ; Perform dequantization and inverse DCT on one block of coefficients.
  81. ;
  82. ; GLOBAL(void)
  83. ; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
  84. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  85. ; The macros below take the pre-alignment frame pointer as (b): +0 is the pushed ebp, +4 the return address, +8 the first argument.
  86. %define dct_table(b) (b)+8 ; void * dct_table
  87. %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
  88. %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
  89. %define output_col(b) (b)+20 ; JDIMENSION output_col
  90. %define original_ebp ebp+0 ; slot at the aligned ebp that holds the pre-alignment frame pointer
  91. %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
  92. %define WK_NUM 12 ; number of xmmword scratch slots reserved below the aligned ebp
  93. align 16
  94. global EXTN(jsimd_idct_islow_sse2)
  95. EXTN(jsimd_idct_islow_sse2):
  96. push ebp
  97. mov eax,esp ; eax = original ebp (esp right after the push, i.e. the usual frame pointer)
  98. sub esp, byte 4 ; make room for one dword so the saved pointer has a slot after alignment
  99. and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
  100. mov [esp],eax ; save the pre-alignment frame pointer at [original_ebp]
  101. mov ebp,esp ; ebp = aligned ebp
  102. lea esp, [wk(0)] ; reserve the WK_NUM xmmword work area (wk(0) is its lowest address)
  103. pushpic ebx
  104. ; push ecx ; unused
  105. ; push edx ; need not be preserved
  106. push esi
  107. push edi
  108. get_GOT ebx ; get GOT address
  109. ; ---- Pass 1: process columns from input.
  110. ; mov eax, [original_ebp] ; left disabled: eax is still expected to hold this value from the prologue (assumes pushpic/get_GOT do not modify eax -- TODO confirm in jsimdext.inc)
  111. mov edx, POINTER [dct_table(eax)] ; quantptr
  112. mov esi, JCOEFPTR [coef_block(eax)] ; inptr
  113. %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
  114. mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] ; cheap early test on two dwords before scanning everything
  115. or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  116. jnz near .columnDCT
  117. movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; OR together XMMBLOCK(1..7) to see whether any AC term is non-zero
  118. movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  119. por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  120. por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  121. por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  122. por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  123. por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  124. por xmm1,xmm0
  125. packsswb xmm1,xmm1 ; signed saturation keeps every non-zero word non-zero while narrowing
  126. packsswb xmm1,xmm1 ; second narrowing leaves all eight lanes represented in the low dword
  127. movd eax,xmm1
  128. test eax,eax
  129. jnz short .columnDCT
  130. ; -- AC terms all zero
  131. movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  132. pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] ; dequantize block 0 with the table at edx
  133. psllw xmm5,PASS1_BITS ; apply the pass-1 scaling directly; no transform is needed on this path
  134. movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
  135. punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
  136. punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
  137. pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
  138. pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
  139. pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
  140. pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
  141. pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
  142. pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
  143. pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
  144. pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
  145. movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
  146. movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
  147. movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
  148. movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
  149. jmp near .column_end
  150. alignx 16,7
  151. %endif
  152. .columnDCT:
  153. ; -- Even part
  154. movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] ; load blocks 0/2/4/6 and dequantize each with the matching table block at edx
  155. movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  156. pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  157. pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  158. movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  159. movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  160. pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  161. pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  162. ; (Original)
  163. ; z1 = (z2 + z3) * 0.541196100;
  164. ; tmp2 = z1 + z3 * -1.847759065;
  165. ; tmp3 = z1 + z2 * 0.765366865;
  166. ;
  167. ; (This implementation)
  168. ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
  169. ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
  170. movdqa xmm4,xmm1 ; xmm1=in2=z2
  171. movdqa xmm5,xmm1
  172. punpcklwd xmm4,xmm3 ; xmm3=in6=z3
  173. punpckhwd xmm5,xmm3
  174. movdqa xmm1,xmm4
  175. movdqa xmm3,xmm5
  176. pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L
  177. pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
  178. pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
  179. pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H
  180. movdqa xmm6,xmm0
  181. paddw xmm0,xmm2 ; xmm0=in0+in4
  182. psubw xmm6,xmm2 ; xmm6=in0-in4
  183. pxor xmm7,xmm7 ; zeroed so the unpack below places each word in the high half of a dword
  184. pxor xmm2,xmm2
  185. punpcklwd xmm7,xmm0 ; xmm7=tmp0L
  186. punpckhwd xmm2,xmm0 ; xmm2=tmp0H
  187. psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
  188. psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
  189. movdqa xmm0,xmm7
  190. paddd xmm7,xmm4 ; xmm7=tmp10L
  191. psubd xmm0,xmm4 ; xmm0=tmp13L
  192. movdqa xmm4,xmm2
  193. paddd xmm2,xmm5 ; xmm2=tmp10H
  194. psubd xmm4,xmm5 ; xmm4=tmp13H
  195. movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
  196. movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
  197. movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
  198. movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
  199. pxor xmm5,xmm5
  200. pxor xmm7,xmm7
  201. punpcklwd xmm5,xmm6 ; xmm5=tmp1L
  202. punpckhwd xmm7,xmm6 ; xmm7=tmp1H
  203. psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
  204. psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
  205. movdqa xmm2,xmm5
  206. paddd xmm5,xmm1 ; xmm5=tmp11L
  207. psubd xmm2,xmm1 ; xmm2=tmp12L
  208. movdqa xmm0,xmm7
  209. paddd xmm7,xmm3 ; xmm7=tmp11H
  210. psubd xmm0,xmm3 ; xmm0=tmp12H
  211. movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
  212. movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
  213. movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
  214. movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
  215. ; -- Odd part
  216. movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] ; load blocks 1/3/5/7 and dequantize: xmm4=in1, xmm6=in3, xmm1=in5, xmm3=in7
  217. movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  218. pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  219. pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  220. movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  221. movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  222. pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  223. pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  224. movdqa xmm5,xmm6
  225. movdqa xmm7,xmm4
  226. paddw xmm5,xmm3 ; xmm5=z3
  227. paddw xmm7,xmm1 ; xmm7=z4
  228. ; (Original)
  229. ; z5 = (z3 + z4) * 1.175875602;
  230. ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
  231. ; z3 += z5; z4 += z5;
  232. ;
  233. ; (This implementation)
  234. ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
  235. ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
  236. movdqa xmm2,xmm5
  237. movdqa xmm0,xmm5
  238. punpcklwd xmm2,xmm7
  239. punpckhwd xmm0,xmm7
  240. movdqa xmm5,xmm2
  241. movdqa xmm7,xmm0
  242. pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L
  243. pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H
  244. pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
  245. pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H
  246. movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
  247. movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
  248. ; (Original)
  249. ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
  250. ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
  251. ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
  252. ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
  253. ; tmp0 += z1 + z3; tmp1 += z2 + z4;
  254. ; tmp2 += z2 + z3; tmp3 += z1 + z4;
  255. ;
  256. ; (This implementation)
  257. ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
  258. ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
  259. ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
  260. ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
  261. ; tmp0 += z3; tmp1 += z4;
  262. ; tmp2 += z3; tmp3 += z4;
  263. movdqa xmm2,xmm3 ; xmm3=in7 (the tmp0 input), xmm4=in1 (the tmp3 input)
  264. movdqa xmm0,xmm3
  265. punpcklwd xmm2,xmm4
  266. punpckhwd xmm0,xmm4
  267. movdqa xmm3,xmm2
  268. movdqa xmm4,xmm0
  269. pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L
  270. pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H
  271. pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L
  272. pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H
  273. paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
  274. paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
  275. paddd xmm3,xmm5 ; xmm3=tmp3L
  276. paddd xmm4,xmm7 ; xmm4=tmp3H
  277. movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
  278. movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
  279. movdqa xmm2,xmm1 ; xmm1=in5 (the tmp1 input), xmm6=in3 (the tmp2 input)
  280. movdqa xmm0,xmm1
  281. punpcklwd xmm2,xmm6
  282. punpckhwd xmm0,xmm6
  283. movdqa xmm1,xmm2
  284. movdqa xmm6,xmm0
  285. pmaddwd xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L
  286. pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H
  287. pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L
  288. pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
  289. paddd xmm2,xmm5 ; xmm2=tmp1L
  290. paddd xmm0,xmm7 ; xmm0=tmp1H
  291. paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
  292. paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
  293. movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
  294. movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
  295. ; -- Final output stage
  296. movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
  297. movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
  298. movdqa xmm2,xmm5
  299. movdqa xmm0,xmm7
  300. paddd xmm5,xmm3 ; xmm5=data0L
  301. paddd xmm7,xmm4 ; xmm7=data0H
  302. psubd xmm2,xmm3 ; xmm2=data7L
  303. psubd xmm0,xmm4 ; xmm0=data7H
  304. movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1]
  305. paddd xmm5,xmm3
  306. paddd xmm7,xmm3
  307. psrad xmm5,DESCALE_P1
  308. psrad xmm7,DESCALE_P1
  309. paddd xmm2,xmm3
  310. paddd xmm0,xmm3
  311. psrad xmm2,DESCALE_P1
  312. psrad xmm0,DESCALE_P1
  313. packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
  314. packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
  315. movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
  316. movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
  317. movdqa xmm7,xmm4
  318. movdqa xmm0,xmm3
  319. paddd xmm4,xmm1 ; xmm4=data1L
  320. paddd xmm3,xmm6 ; xmm3=data1H
  321. psubd xmm7,xmm1 ; xmm7=data6L
  322. psubd xmm0,xmm6 ; xmm0=data6H
  323. movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1]
  324. paddd xmm4,xmm1
  325. paddd xmm3,xmm1
  326. psrad xmm4,DESCALE_P1
  327. psrad xmm3,DESCALE_P1
  328. paddd xmm7,xmm1
  329. paddd xmm0,xmm1
  330. psrad xmm7,DESCALE_P1
  331. psrad xmm0,DESCALE_P1
  332. packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
  333. packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
  334. movdqa xmm6,xmm5 ; transpose coefficients(phase 1)
  335. punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
  336. punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
  337. movdqa xmm1,xmm7 ; transpose coefficients(phase 1)
  338. punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
  339. punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
  340. movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
  341. movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
  342. movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
  343. movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
  344. movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
  345. movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
  346. movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
  347. movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
  348. movdqa xmm5,xmm3
  349. movdqa xmm6,xmm0
  350. paddd xmm3,xmm4 ; xmm3=data2L
  351. paddd xmm0,xmm2 ; xmm0=data2H
  352. psubd xmm5,xmm4 ; xmm5=data5L
  353. psubd xmm6,xmm2 ; xmm6=data5H
  354. movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1]
  355. paddd xmm3,xmm7
  356. paddd xmm0,xmm7
  357. psrad xmm3,DESCALE_P1
  358. psrad xmm0,DESCALE_P1
  359. paddd xmm5,xmm7
  360. paddd xmm6,xmm7
  361. psrad xmm5,DESCALE_P1
  362. psrad xmm6,DESCALE_P1
  363. packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
  364. packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
  365. movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
  366. movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
  367. movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
  368. movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
  369. movdqa xmm0,xmm1
  370. movdqa xmm6,xmm4
  371. paddd xmm1,xmm2 ; xmm1=data3L
  372. paddd xmm4,xmm7 ; xmm4=data3H
  373. psubd xmm0,xmm2 ; xmm0=data4L
  374. psubd xmm6,xmm7 ; xmm6=data4H
  375. movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1]
  376. paddd xmm1,xmm2
  377. paddd xmm4,xmm2
  378. psrad xmm1,DESCALE_P1
  379. psrad xmm4,DESCALE_P1
  380. paddd xmm0,xmm2
  381. paddd xmm6,xmm2
  382. psrad xmm0,DESCALE_P1
  383. psrad xmm6,DESCALE_P1
  384. packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
  385. packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
  386. movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
  387. movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
  388. movdqa xmm4,xmm3 ; transpose coefficients(phase 1)
  389. punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
  390. punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
  391. movdqa xmm6,xmm0 ; transpose coefficients(phase 1)
  392. punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
  393. punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
  394. movdqa xmm1,xmm7 ; transpose coefficients(phase 2)
  395. punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
  396. punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
  397. movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
  398. punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
  399. punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
  400. movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
  401. movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
  402. movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
  403. movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
  404. movdqa xmm2,xmm0 ; transpose coefficients(phase 2)
  405. punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
  406. punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
  407. movdqa xmm5,xmm6 ; transpose coefficients(phase 2)
  408. punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
  409. punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
  410. movdqa xmm3,xmm7 ; transpose coefficients(phase 3)
  411. punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
  412. punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
  413. movdqa xmm4,xmm1 ; transpose coefficients(phase 3)
  414. punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
  415. punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
  416. movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
  417. movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
  418. movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
  419. movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
  420. movdqa xmm3,xmm0 ; transpose coefficients(phase 3)
  421. punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
  422. punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
  423. movdqa xmm4,xmm2 ; transpose coefficients(phase 3)
  424. punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
  425. punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
  426. movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
  427. movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
  428. .column_end:
  429. ; -- Prefetch the next coefficient block
  430. prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
  431. prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
  432. prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
  433. prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
  434. ; ---- Pass 2: process rows from work array, store into output array.
  435. mov eax, [original_ebp] ; reload the pre-alignment frame pointer saved by the prologue
  436. mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
  437. mov eax, JDIMENSION [output_col(eax)] ; eax = output_col from here on
  438. ; -- Even part
  439. ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 (col1/3/5/7 are in wk(8)..wk(11), set by either pass-1 path)
  440. ; (Original)
  441. ; z1 = (z2 + z3) * 0.541196100;
  442. ; tmp2 = z1 + z3 * -1.847759065;
  443. ; tmp3 = z1 + z2 * 0.765366865;
  444. ;
  445. ; (This implementation)
  446. ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
  447. ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
  448. movdqa xmm6,xmm1 ; xmm1=in2=z2
  449. movdqa xmm5,xmm1
  450. punpcklwd xmm6,xmm2 ; xmm2=in6=z3
  451. punpckhwd xmm5,xmm2
  452. movdqa xmm1,xmm6
  453. movdqa xmm2,xmm5
  454. pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L
  455. pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H
  456. pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L
  457. pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H
  458. movdqa xmm3,xmm7
  459. paddw xmm7,xmm0 ; xmm7=in0+in4
  460. psubw xmm3,xmm0 ; xmm3=in0-in4
  461. pxor xmm4,xmm4 ; zeroed so the unpack below places each word in the high half of a dword
  462. pxor xmm0,xmm0
  463. punpcklwd xmm4,xmm7 ; xmm4=tmp0L
  464. punpckhwd xmm0,xmm7 ; xmm0=tmp0H
  465. psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
  466. psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
  467. movdqa xmm7,xmm4
  468. paddd xmm4,xmm6 ; xmm4=tmp10L
  469. psubd xmm7,xmm6 ; xmm7=tmp13L
  470. movdqa xmm6,xmm0
  471. paddd xmm0,xmm5 ; xmm0=tmp10H
  472. psubd xmm6,xmm5 ; xmm6=tmp13H
  473. movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
  474. movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
  475. movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
  476. movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
  477. pxor xmm5,xmm5
  478. pxor xmm4,xmm4
  479. punpcklwd xmm5,xmm3 ; xmm5=tmp1L
  480. punpckhwd xmm4,xmm3 ; xmm4=tmp1H
  481. psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
  482. psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
  483. movdqa xmm0,xmm5
  484. paddd xmm5,xmm1 ; xmm5=tmp11L
  485. psubd xmm0,xmm1 ; xmm0=tmp12L
  486. movdqa xmm7,xmm4
  487. paddd xmm4,xmm2 ; xmm4=tmp11H
  488. psubd xmm7,xmm2 ; xmm7=tmp12H
  489. movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
  490. movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
  491. movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
  492. movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
  493. ; -- Odd part
  494. movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
  495. movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
  496. movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
  497. movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
  498. movdqa xmm5,xmm6
  499. movdqa xmm4,xmm3
  500. paddw xmm5,xmm1 ; xmm5=z3
  501. paddw xmm4,xmm2 ; xmm4=z4
  502. ; (Original)
  503. ; z5 = (z3 + z4) * 1.175875602;
  504. ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
  505. ; z3 += z5; z4 += z5;
  506. ;
  507. ; (This implementation)
  508. ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
  509. ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
  510. movdqa xmm0,xmm5
  511. movdqa xmm7,xmm5
  512. punpcklwd xmm0,xmm4
  513. punpckhwd xmm7,xmm4
  514. movdqa xmm5,xmm0
  515. movdqa xmm4,xmm7
  516. pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L
  517. pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H
  518. pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L
  519. pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H
  520. movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
  521. movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
  522. ; (Original)
  523. ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
  524. ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
  525. ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
  526. ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
  527. ; tmp0 += z1 + z3; tmp1 += z2 + z4;
  528. ; tmp2 += z2 + z3; tmp3 += z1 + z4;
  529. ;
  530. ; (This implementation)
  531. ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
  532. ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
  533. ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
  534. ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
  535. ; tmp0 += z3; tmp1 += z4;
  536. ; tmp2 += z3; tmp3 += z4;
  537. movdqa xmm0,xmm1 ; xmm1=col7 (the tmp0 input), xmm3=col1 (the tmp3 input)
  538. movdqa xmm7,xmm1
  539. punpcklwd xmm0,xmm3
  540. punpckhwd xmm7,xmm3
  541. movdqa xmm1,xmm0
  542. movdqa xmm3,xmm7
  543. pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L
  544. pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H
  545. pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L
  546. pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H
  547. paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
  548. paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
  549. paddd xmm1,xmm5 ; xmm1=tmp3L
  550. paddd xmm3,xmm4 ; xmm3=tmp3H
  551. movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
  552. movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
  553. movdqa xmm0,xmm2 ; xmm2=col5 (the tmp1 input), xmm6=col3 (the tmp2 input)
  554. movdqa xmm7,xmm2
  555. punpcklwd xmm0,xmm6
  556. punpckhwd xmm7,xmm6
  557. movdqa xmm2,xmm0
  558. movdqa xmm6,xmm7
  559. pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L
  560. pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H
  561. pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L
  562. pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H
  563. paddd xmm0,xmm5 ; xmm0=tmp1L
  564. paddd xmm7,xmm4 ; xmm7=tmp1H
  565. paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
  566. paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
  567. movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
  568. movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
  569. ; -- Final output stage
  570. movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
  571. movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
  572. movdqa xmm0,xmm5
  573. movdqa xmm7,xmm4
  574. paddd xmm5,xmm1 ; xmm5=data0L
  575. paddd xmm4,xmm3 ; xmm4=data0H
  576. psubd xmm0,xmm1 ; xmm0=data7L
  577. psubd xmm7,xmm3 ; xmm7=data7H
  578. movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2]
  579. paddd xmm5,xmm1
  580. paddd xmm4,xmm1
  581. psrad xmm5,DESCALE_P2
  582. psrad xmm4,DESCALE_P2
  583. paddd xmm0,xmm1
  584. paddd xmm7,xmm1
  585. psrad xmm0,DESCALE_P2
  586. psrad xmm7,DESCALE_P2
  587. packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
  588. packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
  589. movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
  590. movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
  591. movdqa xmm4,xmm3
  592. movdqa xmm7,xmm1
  593. paddd xmm3,xmm2 ; xmm3=data1L
  594. paddd xmm1,xmm6 ; xmm1=data1H
  595. psubd xmm4,xmm2 ; xmm4=data6L
  596. psubd xmm7,xmm6 ; xmm7=data6H
  597. movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2]
  598. paddd xmm3,xmm2
  599. paddd xmm1,xmm2
  600. psrad xmm3,DESCALE_P2
  601. psrad xmm1,DESCALE_P2
  602. paddd xmm4,xmm2
  603. paddd xmm7,xmm2
  604. psrad xmm4,DESCALE_P2
  605. psrad xmm7,DESCALE_P2
  606. packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
  607. packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
  608. packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
  609. packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
  610. movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
  611. movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
  612. movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
  613. movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
  614. movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
  615. movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
  616. movdqa xmm4,xmm6
  617. movdqa xmm0,xmm2
  618. paddd xmm6,xmm1 ; xmm6=data2L
  619. paddd xmm2,xmm7 ; xmm2=data2H
  620. psubd xmm4,xmm1 ; xmm4=data5L
  621. psubd xmm0,xmm7 ; xmm0=data5H
  622. movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2]
  623. paddd xmm6,xmm5
  624. paddd xmm2,xmm5
  625. psrad xmm6,DESCALE_P2
  626. psrad xmm2,DESCALE_P2
  627. paddd xmm4,xmm5
  628. paddd xmm0,xmm5
  629. psrad xmm4,DESCALE_P2
  630. psrad xmm0,DESCALE_P2
  631. packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
  632. packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
  633. movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
  634. movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
  635. movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
  636. movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
  637. movdqa xmm2,xmm3
  638. movdqa xmm0,xmm1
  639. paddd xmm3,xmm7 ; xmm3=data3L
  640. paddd xmm1,xmm5 ; xmm1=data3H
  641. psubd xmm2,xmm7 ; xmm2=data4L
  642. psubd xmm0,xmm5 ; xmm0=data4H
  643. movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2]
  644. paddd xmm3,xmm7
  645. paddd xmm1,xmm7
  646. psrad xmm3,DESCALE_P2
  647. psrad xmm1,DESCALE_P2
  648. paddd xmm2,xmm7
  649. paddd xmm0,xmm7
  650. psrad xmm2,DESCALE_P2
  651. psrad xmm0,DESCALE_P2
  652. movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP]
  653. packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
  654. packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
  655. movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
  656. movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
  657. packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
  658. packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
  659. paddb xmm7,xmm5 ; add the CENTERJSAMPLE bias to every signed-saturated byte (presumably re-centering to the unsigned sample range -- CENTERJSAMPLE is defined outside this file)
  660. paddb xmm1,xmm5
  661. paddb xmm6,xmm5
  662. paddb xmm3,xmm5
  663. movdqa xmm0,xmm7 ; transpose coefficients(phase 1)
  664. punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
  665. punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
  666. movdqa xmm2,xmm6 ; transpose coefficients(phase 1)
  667. punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
  668. punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
  669. movdqa xmm4,xmm7 ; transpose coefficients(phase 2)
  670. punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
  671. punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
  672. movdqa xmm5,xmm2 ; transpose coefficients(phase 2)
  673. punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
  674. punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
  675. movdqa xmm1,xmm7 ; transpose coefficients(phase 3)
  676. punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
  677. punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
  678. movdqa xmm3,xmm4 ; transpose coefficients(phase 3)
  679. punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
  680. punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
  681. pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
  682. pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
  683. pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
  684. pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
  685. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; fetch row pointers from output_buf two at a time (rows 0 and 2 here)
  686. mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
  687. movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 ; store the low 8 bytes (row 0) at column offset eax
  688. movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 ; row 2
  689. mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
  690. mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
  691. movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ; row 4
  692. movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ; row 6
  693. mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  694. mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
  695. movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ; row 1 (moved to the low half by pshufd 0x4E)
  696. movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 ; row 3
  697. mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
  698. mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
  699. movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 ; row 5
  700. movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 ; row 7
  701. pop edi
  702. pop esi
  703. ; pop edx ; need not be preserved
  704. ; pop ecx ; unused
  705. poppic ebx
  706. mov esp,ebp ; esp <- aligned ebp
  707. pop esp ; esp <- original ebp
  708. pop ebp
  709. ret
  710. ; For some reason, the OS X linker does not honor the request to align the
  711. ; segment unless we do this.
  712. align 16