/media/libjpeg/simd/jimmxfst.asm

http://github.com/zpao/v8monkey · Assembly · 500 lines · 345 code · 87 blank · 68 comment · 2 complexity · 00f86e2bd3ebfb7fe17210e27bc3c852 MD5 · raw file

  1. ;
  2. ; jimmxfst.asm - fast integer IDCT (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on
  7. ; x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a fast, not so accurate integer implementation of
  18. ; the inverse DCT (Discrete Cosine Transform). The following code is
  19. ; based directly on the IJG's original jidctfst.c; see the jidctfst.c
  20. ; for more details.
  21. ;
  22. ; [TAB8]
  23. %include "jsimdext.inc"
  24. %include "jdct.inc"
  25. ; --------------------------------------------------------------------------
  26. %define CONST_BITS 8 ; fractional bits of the F_* fixed-point constants; 14 is also OK.
  27. %define PASS1_BITS 2 ; pass 2 descales its results by (PASS1_BITS+3)
  ; IFAST_SCALE_BITS is not defined in this file (presumably it comes from one
  ; of the %include'd files -- confirm); assembly is aborted if it disagrees
  ; with PASS1_BITS.
  28. %if IFAST_SCALE_BITS != PASS1_BITS
  29. %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
  30. %endif
  ; FIX(x) below means x * 2^CONST_BITS rounded to the nearest integer.
  ; For CONST_BITS == 8 the values are written out literally (x * 256).
  31. %if CONST_BITS == 8
  32. F_1_082 equ 277 ; FIX(1.082392200)
  33. F_1_414 equ 362 ; FIX(1.414213562)
  34. F_1_847 equ 473 ; FIX(1.847759065)
  35. F_2_613 equ 669 ; FIX(2.613125930)
  36. F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
  37. %else
  38. ; NASM cannot do compile-time arithmetic on floating-point constants.
  ; So each constant is given as an integer scaled by 2^30, and DESCALE
  ; rounds it down to CONST_BITS fractional bits (add half, then shift right).
  39. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
  40. F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
  41. F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
  42. F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
  43. F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
  44. F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
  45. %endif
  46. ; --------------------------------------------------------------------------
  47. SECTION SEG_CONST
  ; Multiplier table for the pmulhw-based fixed-point multiplies in
  ; jsimd_idct_ifast_mmx.  pmulhw keeps only the high 16 bits of each 16x16
  ; product, so the data operand is pre-shifted left by
  ; PRE_MULTIPLY_SCALE_BITS (psllw in the code) and each constant is stored
  ; shifted left by CONST_SHIFT; the shifts total 16 together with
  ; CONST_BITS, so the result is (x * F) >> CONST_BITS.
  48. ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
  49. ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
  50. %define PRE_MULTIPLY_SCALE_BITS 2
  51. %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
  ; With CONST_BITS == 8 this gives CONST_SHIFT == 6.
  52. alignz 16
  53. global EXTN(jconst_idct_ifast_mmx)
  54. EXTN(jconst_idct_ifast_mmx):
  ; Each PW_* entry is one 64-bit operand: the same 16-bit word repeated 4 times.
  55. PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
  56. PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
  ; Negated constant: used for the "-1.613125930 * z10" term.
  57. PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
  58. PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
  ; Eight bytes of CENTERJSAMPLE (defined outside this file), added to the
  ; packed output bytes at the end of pass 2.
  59. PB_CENTERJSAMP times 8 db CENTERJSAMPLE
  60. alignz 16
  61. ; --------------------------------------------------------------------------
  62. SECTION SEG_TEXT
  63. BITS 32
  64. ;
  65. ; Perform dequantization and inverse DCT on one block of coefficients.
  66. ;
  67. ; GLOBAL(void)
  68. ; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
  69. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  70. ;
  ; The four arguments are read from the stack at +8..+20 relative to the
  ; frame address captured in the prologue (see the %defines below).
  ;
  ; Pass 1 multiplies the coefficients by the dct_table entries, transforms
  ; the columns and stores the transposed result in a workspace on the stack.
  ; Pass 2 transforms the workspace, descales by (PASS1_BITS+3), packs the
  ; results to bytes, adds CENTERJSAMPLE and stores 8-byte rows at
  ; output_buf[row] + output_col*SIZEOF_JSAMPLE.
  ;
  ; ebx, esi, edi and ebp are saved and restored.  eax, ecx, edx and mm0-mm7
  ; are overwritten.  emms is executed before returning.
  ;
  71. %define dct_table(b) (b)+8 ; void * dct_table
  72. %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
  73. %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
  74. %define output_col(b) (b)+20 ; JDIMENSION output_col
  75. %define original_ebp ebp+0
  76. %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
  77. %define WK_NUM 2
  78. %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
  79. ; JCOEF workspace[DCTSIZE2]
  80. align 16
  81. global EXTN(jsimd_idct_ifast_mmx)
  82. EXTN(jsimd_idct_ifast_mmx):
  83. push ebp
  84. mov eax,esp ; eax = frame address (esp after push ebp); args are at eax+8..
  85. sub esp, byte 4 ; room for the saved frame address
  86. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  87. mov [esp],eax
  88. mov ebp,esp ; ebp = aligned ebp
  89. lea esp, [workspace] ; reserve wk[] and workspace[] below the aligned ebp
  90. push ebx
  91. ; push ecx ; need not be preserved
  92. ; push edx ; need not be preserved
  93. push esi
  94. push edi
  95. get_GOT ebx ; get GOT address
  96. ; ---- Pass 1: process columns from input, store into work array.
  ; (The load below is not needed: eax still holds the frame address that was
  ; stored at [original_ebp] in the prologue.)
  97. ; mov eax, [original_ebp]
  98. mov edx, POINTER [dct_table(eax)] ; quantptr
  99. mov esi, JCOEFPTR [coef_block(eax)] ; inptr
  100. lea edi, [workspace] ; JCOEF * wsptr
  ; Each iteration handles four 16-bit columns (one mmword per row).
  101. mov ecx, DCTSIZE/4 ; ctr
  102. alignx 16,7
  103. .columnloop:
  ; Shortcut (can be disabled with NO_ZERO_COLUMN_TEST_IFAST_MMX): if all the
  ; AC terms tested below are zero, skip the full column IDCT and just
  ; replicate the dequantized DC terms.
  104. %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
  ; Cheap early-out: any nonzero bit in these two dwords means a real IDCT.
  105. mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  106. or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  107. jnz short .columnDCT
  ; Full test: OR together blocks 1..7 for these four columns.
  108. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  109. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  110. por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  111. por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  112. por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  113. por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  114. por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  115. por mm1,mm0
  ; Signed saturation maps every nonzero word to a nonzero byte, so after the
  ; pack the low dword is zero only if all four words were zero.
  116. packsswb mm1,mm1
  117. movd eax,mm1
  118. test eax,eax
  119. jnz short .columnDCT
  120. ; -- AC terms all zero
  121. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  122. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  123. movq mm2,mm0 ; mm0=in0=(00 01 02 03)
  124. punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
  125. punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
  126. movq mm1,mm0
  127. punpckldq mm0,mm0 ; mm0=(00 00 00 00)
  128. punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
  129. movq mm3,mm2
  130. punpckldq mm2,mm2 ; mm2=(02 02 02 02)
  131. punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
  ; Each replicated DC value is written to both mmword slots (x,0) and (x,1).
  132. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
  133. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
  134. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
  135. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
  136. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
  137. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
  138. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
  139. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
  140. jmp near .nextcolumn
  141. alignx 16,7
  142. %endif
  143. .columnDCT:
  144. ; -- Even part
  ; Inputs 0,2,4,6 are loaded and multiplied by the matching dct_table entries.
  145. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  146. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  147. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  148. pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  149. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  150. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  151. pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  152. pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  153. movq mm4,mm0
  154. movq mm5,mm1
  155. psubw mm0,mm2 ; mm0=tmp11
  156. psubw mm1,mm3
  157. paddw mm4,mm2 ; mm4=tmp10
  158. paddw mm5,mm3 ; mm5=tmp13
  ; Pre-shift, then pmulhw by the pre-shifted constant: (x * F_1_414) >> CONST_BITS.
  159. psllw mm1,PRE_MULTIPLY_SCALE_BITS
  160. pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
  161. psubw mm1,mm5 ; mm1=tmp12
  162. movq mm6,mm4
  163. movq mm7,mm0
  164. psubw mm4,mm5 ; mm4=tmp3
  165. psubw mm0,mm1 ; mm0=tmp2
  166. paddw mm6,mm5 ; mm6=tmp0
  167. paddw mm7,mm1 ; mm7=tmp1
  ; tmp2/tmp3 are spilled so mm0/mm4 can be reused by the odd part.
  168. movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
  169. movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
  170. ; -- Odd part
  171. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  172. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  173. pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  174. pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  175. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  176. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  177. pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  178. pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  179. movq mm4,mm2
  180. movq mm0,mm5
  181. psubw mm2,mm1 ; mm2=z12
  182. psubw mm5,mm3 ; mm5=z10
  183. paddw mm4,mm1 ; mm4=z11
  184. paddw mm0,mm3 ; mm0=z13
  185. movq mm1,mm5 ; mm1=z10(unscaled)
  186. psllw mm2,PRE_MULTIPLY_SCALE_BITS
  187. psllw mm5,PRE_MULTIPLY_SCALE_BITS
  188. movq mm3,mm4
  189. psubw mm4,mm0
  190. paddw mm3,mm0 ; mm3=tmp7
  191. psllw mm4,PRE_MULTIPLY_SCALE_BITS
  192. pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
  193. ; To avoid overflow...
  194. ;
  195. ; (Original)
  196. ; tmp12 = -2.613125930 * z10 + z5;
  197. ;
  198. ; (This implementation)
  199. ; tmp12 = (-1.613125930 - 1) * z10 + z5;
  200. ; = -1.613125930 * z10 - z10 + z5;
  201. movq mm0,mm5
  202. paddw mm5,mm2
  203. pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
  204. pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
  205. pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
  206. psubw mm0,mm1
  207. psubw mm2,mm5 ; mm2=tmp10
  208. paddw mm0,mm5 ; mm0=tmp12
  209. ; -- Final output stage
  210. psubw mm0,mm3 ; mm0=tmp6
  211. movq mm1,mm6
  212. movq mm5,mm7
  213. paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
  214. paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
  215. psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
  216. psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
  217. psubw mm4,mm0 ; mm4=tmp5
  218. movq mm3,mm6 ; transpose coefficients(phase 1)
  219. punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
  220. punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
  221. movq mm0,mm5 ; transpose coefficients(phase 1)
  222. punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
  223. punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
  ; Swap: reload the spilled tmp2/tmp3 and park the half-transposed 6/7 data
  ; in the same wk[] slots.
  224. movq mm7, MMWORD [wk(0)] ; mm7=tmp2
  225. movq mm1, MMWORD [wk(1)] ; mm1=tmp3
  226. movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
  227. movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
  228. paddw mm2,mm4 ; mm2=tmp4
  229. movq mm5,mm7
  230. movq mm0,mm1
  231. paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
  232. paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
  233. psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
  234. psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
  235. movq mm4,mm7 ; transpose coefficients(phase 1)
  236. punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
  237. punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
  238. movq mm2,mm1 ; transpose coefficients(phase 1)
  239. punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
  240. punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
  241. movq mm0,mm6 ; transpose coefficients(phase 2)
  242. punpckldq mm6,mm7 ; mm6=(00 10 20 30)
  243. punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
  244. movq mm5,mm3 ; transpose coefficients(phase 2)
  245. punpckldq mm3,mm4 ; mm3=(02 12 22 32)
  246. punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
  247. movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
  248. movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
  249. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
  250. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
  251. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
  252. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
  253. movq mm6,mm1 ; transpose coefficients(phase 2)
  254. punpckldq mm1,mm7 ; mm1=(40 50 60 70)
  255. punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
  256. movq mm0,mm2 ; transpose coefficients(phase 2)
  257. punpckldq mm2,mm4 ; mm2=(42 52 62 72)
  258. punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
  259. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
  260. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
  261. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
  262. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
  263. .nextcolumn:
  ; Advance input and multiplier pointers by four columns, and the
  ; workspace pointer by 4*DCTSIZE coefficients.
  264. add esi, byte 4*SIZEOF_JCOEF ; coef_block
  265. add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
  266. add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
  267. dec ecx ; ctr
  268. jnz near .columnloop
  269. ; ---- Pass 2: process rows from work array, store into output array.
  ; eax was overwritten in pass 1, so the frame address is reloaded here.
  270. mov eax, [original_ebp]
  271. lea esi, [workspace] ; JCOEF * wsptr
  272. mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
  273. mov eax, JDIMENSION [output_col(eax)]
  ; Each iteration produces four output rows of eight samples.
  274. mov ecx, DCTSIZE/4 ; ctr
  275. alignx 16,7
  276. .rowloop:
  277. ; -- Even part
  ; Same arithmetic as pass 1, but without the dct_table multiplies.
  278. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  279. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  280. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  281. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  282. movq mm4,mm0
  283. movq mm5,mm1
  284. psubw mm0,mm2 ; mm0=tmp11
  285. psubw mm1,mm3
  286. paddw mm4,mm2 ; mm4=tmp10
  287. paddw mm5,mm3 ; mm5=tmp13
  288. psllw mm1,PRE_MULTIPLY_SCALE_BITS
  289. pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
  290. psubw mm1,mm5 ; mm1=tmp12
  291. movq mm6,mm4
  292. movq mm7,mm0
  293. psubw mm4,mm5 ; mm4=tmp3
  294. psubw mm0,mm1 ; mm0=tmp2
  295. paddw mm6,mm5 ; mm6=tmp0
  296. paddw mm7,mm1 ; mm7=tmp1
  297. movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
  298. movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
  299. ; -- Odd part
  300. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  301. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  302. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  303. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  304. movq mm4,mm2
  305. movq mm0,mm5
  306. psubw mm2,mm1 ; mm2=z12
  307. psubw mm5,mm3 ; mm5=z10
  308. paddw mm4,mm1 ; mm4=z11
  309. paddw mm0,mm3 ; mm0=z13
  310. movq mm1,mm5 ; mm1=z10(unscaled)
  311. psllw mm2,PRE_MULTIPLY_SCALE_BITS
  312. psllw mm5,PRE_MULTIPLY_SCALE_BITS
  313. movq mm3,mm4
  314. psubw mm4,mm0
  315. paddw mm3,mm0 ; mm3=tmp7
  316. psllw mm4,PRE_MULTIPLY_SCALE_BITS
  317. pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
  318. ; To avoid overflow...
  319. ;
  320. ; (Original)
  321. ; tmp12 = -2.613125930 * z10 + z5;
  322. ;
  323. ; (This implementation)
  324. ; tmp12 = (-1.613125930 - 1) * z10 + z5;
  325. ; = -1.613125930 * z10 - z10 + z5;
  326. movq mm0,mm5
  327. paddw mm5,mm2
  328. pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
  329. pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
  330. pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
  331. psubw mm0,mm1
  332. psubw mm2,mm5 ; mm2=tmp10
  333. paddw mm0,mm5 ; mm0=tmp12
  334. ; -- Final output stage
  335. psubw mm0,mm3 ; mm0=tmp6
  336. movq mm1,mm6
  337. movq mm5,mm7
  338. paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
  339. paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
  340. psraw mm6,(PASS1_BITS+3) ; descale
  341. psraw mm7,(PASS1_BITS+3) ; descale
  342. psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
  343. psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
  344. psraw mm1,(PASS1_BITS+3) ; descale
  345. psraw mm5,(PASS1_BITS+3) ; descale
  346. psubw mm4,mm0 ; mm4=tmp5
  ; packsswb narrows the words to bytes with signed saturation.
  347. packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
  348. packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
  349. movq mm3, MMWORD [wk(0)] ; mm3=tmp2
  350. movq mm0, MMWORD [wk(1)] ; mm0=tmp3
  351. paddw mm2,mm4 ; mm2=tmp4
  352. movq mm5,mm3
  353. movq mm1,mm0
  354. paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
  355. paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
  356. psraw mm3,(PASS1_BITS+3) ; descale
  357. psraw mm0,(PASS1_BITS+3) ; descale
  358. psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
  359. psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
  360. psraw mm5,(PASS1_BITS+3) ; descale
  361. psraw mm1,(PASS1_BITS+3) ; descale
  362. movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
  363. packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
  364. packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
  ; Add CENTERJSAMPLE to every byte (paddb wraps modulo 256); presumably this
  ; recenters the signed results into the unsigned sample range -- confirm.
  365. paddb mm6,mm4
  366. paddb mm7,mm4
  367. paddb mm3,mm4
  368. paddb mm1,mm4
  369. movq mm2,mm6 ; transpose coefficients(phase 1)
  370. punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
  371. punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
  372. movq mm0,mm3 ; transpose coefficients(phase 1)
  373. punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
  374. punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
  375. movq mm5,mm6 ; transpose coefficients(phase 2)
  376. punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
  377. punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
  378. movq mm4,mm0 ; transpose coefficients(phase 2)
  379. punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
  380. punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
  381. movq mm7,mm6 ; transpose coefficients(phase 3)
  382. punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
  383. punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
  384. movq mm1,mm5 ; transpose coefficients(phase 3)
  385. punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
  386. punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
  ; ebx is borrowed as a second row pointer for the stores below, so the GOT
  ; address it holds is saved and restored around them.
  387. pushpic ebx ; save GOT address
  388. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
  389. mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  390. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
  391. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
  392. mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
  393. mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
  394. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
  395. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
  396. poppic ebx ; restore GOT address
  397. add esi, byte 4*SIZEOF_JCOEF ; wsptr
  398. add edi, byte 4*SIZEOF_JSAMPROW
  399. dec ecx ; ctr
  400. jnz near .rowloop
  401. emms ; empty MMX state
  402. pop edi
  403. pop esi
  404. ; pop edx ; need not be preserved
  405. ; pop ecx ; need not be preserved
  406. pop ebx
  ; Unwind the aligned frame: [ebp] holds the frame address saved in the
  ; prologue, and the caller's ebp is stored at that address.
  407. mov esp,ebp ; esp <- aligned ebp
  408. pop esp ; esp <- original ebp
  409. pop ebp
  410. ret
  411. ; For some reason, the OS X linker does not honor the request to align the
  412. ; segment unless we do this.
  413. align 16