/media/libjpeg/simd/jimmxred.asm

http://github.com/zpao/v8monkey · Assembly · 706 lines · 482 code · 132 blank · 92 comment · 1 complexity · 68891dfcfa1d936e39e0b047945dd602 MD5 · raw file

  1. ;
  2. ; jimmxred.asm - reduced-size IDCT (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on
  7. ; x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains inverse-DCT routines that produce reduced-size
  18. ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
  19. ; The following code is based directly on the IJG's original jidctred.c;
  20. ; see the jidctred.c for more details.
  21. ;
  22. ; [TAB8]
  23. %include "jsimdext.inc"
  24. %include "jdct.inc"
  25. ; --------------------------------------------------------------------------
  26. %define CONST_BITS 13
  27. %define PASS1_BITS 2
  28. %define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1)
  29. %define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1)
  30. %define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2)
  31. %define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2)
  32. %if CONST_BITS == 13
  33. F_0_211 equ 1730 ; FIX(0.211164243)
  34. F_0_509 equ 4176 ; FIX(0.509795579)
  35. F_0_601 equ 4926 ; FIX(0.601344887)
  36. F_0_720 equ 5906 ; FIX(0.720959822)
  37. F_0_765 equ 6270 ; FIX(0.765366865)
  38. F_0_850 equ 6967 ; FIX(0.850430095)
  39. F_0_899 equ 7373 ; FIX(0.899976223)
  40. F_1_061 equ 8697 ; FIX(1.061594337)
  41. F_1_272 equ 10426 ; FIX(1.272758580)
  42. F_1_451 equ 11893 ; FIX(1.451774981)
  43. F_1_847 equ 15137 ; FIX(1.847759065)
  44. F_2_172 equ 17799 ; FIX(2.172734803)
  45. F_2_562 equ 20995 ; FIX(2.562915447)
  46. F_3_624 equ 29692 ; FIX(3.624509785)
  47. %else
  48. ; NASM cannot do compile-time arithmetic on floating-point constants.
  49. %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
  50. F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243)
  51. F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579)
  52. F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887)
  53. F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822)
  54. F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865)
  55. F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095)
  56. F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223)
  57. F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337)
  58. F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580)
  59. F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981)
  60. F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
  61. F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803)
  62. F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447)
  63. F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785)
  64. %endif
  65. ; --------------------------------------------------------------------------
  66. SECTION SEG_CONST
  67. alignz 16
  68. global EXTN(jconst_idct_red_mmx)
  69. EXTN(jconst_idct_red_mmx):
  70. PW_F184_MF076 times 2 dw F_1_847,-F_0_765
  71. PW_F256_F089 times 2 dw F_2_562, F_0_899
  72. PW_F106_MF217 times 2 dw F_1_061,-F_2_172
  73. PW_MF060_MF050 times 2 dw -F_0_601,-F_0_509
  74. PW_F145_MF021 times 2 dw F_1_451,-F_0_211
  75. PW_F362_MF127 times 2 dw F_3_624,-F_1_272
  76. PW_F085_MF072 times 2 dw F_0_850,-F_0_720
  77. PD_DESCALE_P1_4 times 2 dd 1 << (DESCALE_P1_4-1)
  78. PD_DESCALE_P2_4 times 2 dd 1 << (DESCALE_P2_4-1)
  79. PD_DESCALE_P1_2 times 2 dd 1 << (DESCALE_P1_2-1)
  80. PD_DESCALE_P2_2 times 2 dd 1 << (DESCALE_P2_2-1)
  81. PB_CENTERJSAMP times 8 db CENTERJSAMPLE
  82. alignz 16
  83. ; --------------------------------------------------------------------------
  84. SECTION SEG_TEXT
  85. BITS 32
  86. ;
  87. ; Perform dequantization and inverse DCT on one block of coefficients,
  88. ; producing a reduced-size 4x4 output block.
  89. ;
  90. ; GLOBAL(void)
  91. ; jsimd_idct_4x4_mmx (void * dct_table, JCOEFPTR coef_block,
  92. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  93. ;
  94. %define dct_table(b) (b)+8 ; void * dct_table
  95. %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
  96. %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
  97. %define output_col(b) (b)+20 ; JDIMENSION output_col
  98. %define original_ebp ebp+0
  99. %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
  100. %define WK_NUM 2
  101. %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
  102. ; JCOEF workspace[DCTSIZE2]
  103. align 16
  104. global EXTN(jsimd_idct_4x4_mmx)
  105. EXTN(jsimd_idct_4x4_mmx):
  106. push ebp
  107. mov eax,esp ; eax = original ebp
  108. sub esp, byte 4
  109. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  110. mov [esp],eax
  111. mov ebp,esp ; ebp = aligned ebp
  112. lea esp, [workspace]
  113. pushpic ebx
  114. ; push ecx ; need not be preserved
  115. ; push edx ; need not be preserved
  116. push esi
  117. push edi
  118. get_GOT ebx ; get GOT address
  119. ; ---- Pass 1: process columns from input, store into work array.
  120. ; mov eax, [original_ebp]
  121. mov edx, POINTER [dct_table(eax)] ; quantptr
  122. mov esi, JCOEFPTR [coef_block(eax)] ; inptr
  123. lea edi, [workspace] ; JCOEF * wsptr
  124. mov ecx, DCTSIZE/4 ; ctr
  125. alignx 16,7
  126. .columnloop:
  127. %ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
  128. mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  129. or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  130. jnz short .columnDCT
  131. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  132. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  133. por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  134. por mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  135. por mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  136. por mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  137. por mm0,mm1
  138. packsswb mm0,mm0
  139. movd eax,mm0
  140. test eax,eax
  141. jnz short .columnDCT
  142. ; -- AC terms all zero
  143. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  144. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  145. psllw mm0,PASS1_BITS
  146. movq mm2,mm0 ; mm0=in0=(00 01 02 03)
  147. punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
  148. punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
  149. movq mm1,mm0
  150. punpckldq mm0,mm0 ; mm0=(00 00 00 00)
  151. punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
  152. movq mm3,mm2
  153. punpckldq mm2,mm2 ; mm2=(02 02 02 02)
  154. punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
  155. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
  156. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
  157. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
  158. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
  159. jmp near .nextcolumn
  160. alignx 16,7
  161. %endif
  162. .columnDCT:
  163. ; -- Odd part
  164. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  165. movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  166. pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  167. pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  168. movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  169. movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  170. pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  171. pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  172. movq mm4,mm0
  173. movq mm5,mm0
  174. punpcklwd mm4,mm1
  175. punpckhwd mm5,mm1
  176. movq mm0,mm4
  177. movq mm1,mm5
  178. pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
  179. pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
  180. pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
  181. pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
  182. movq mm6,mm2
  183. movq mm7,mm2
  184. punpcklwd mm6,mm3
  185. punpckhwd mm7,mm3
  186. movq mm2,mm6
  187. movq mm3,mm7
  188. pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
  189. pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
  190. pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
  191. pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
  192. paddd mm6,mm4 ; mm6=tmp2L
  193. paddd mm7,mm5 ; mm7=tmp2H
  194. paddd mm2,mm0 ; mm2=tmp0L
  195. paddd mm3,mm1 ; mm3=tmp0H
  196. movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
  197. movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
  198. ; -- Even part
  199. movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  200. movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  201. movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  202. pmullw mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  203. pmullw mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  204. pmullw mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  205. pxor mm1,mm1
  206. pxor mm2,mm2
  207. punpcklwd mm1,mm4 ; mm1=tmp0L
  208. punpckhwd mm2,mm4 ; mm2=tmp0H
  209. psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
  210. psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
  211. movq mm3,mm5 ; mm5=in2=z2
  212. punpcklwd mm5,mm0 ; mm0=in6=z3
  213. punpckhwd mm3,mm0
  214. pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
  215. pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
  216. movq mm4,mm1
  217. movq mm0,mm2
  218. paddd mm1,mm5 ; mm1=tmp10L
  219. paddd mm2,mm3 ; mm2=tmp10H
  220. psubd mm4,mm5 ; mm4=tmp12L
  221. psubd mm0,mm3 ; mm0=tmp12H
  222. ; -- Final output stage
  223. movq mm5,mm1
  224. movq mm3,mm2
  225. paddd mm1,mm6 ; mm1=data0L
  226. paddd mm2,mm7 ; mm2=data0H
  227. psubd mm5,mm6 ; mm5=data3L
  228. psubd mm3,mm7 ; mm3=data3H
  229. movq mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm6=[PD_DESCALE_P1_4]
  230. paddd mm1,mm6
  231. paddd mm2,mm6
  232. psrad mm1,DESCALE_P1_4
  233. psrad mm2,DESCALE_P1_4
  234. paddd mm5,mm6
  235. paddd mm3,mm6
  236. psrad mm5,DESCALE_P1_4
  237. psrad mm3,DESCALE_P1_4
  238. packssdw mm1,mm2 ; mm1=data0=(00 01 02 03)
  239. packssdw mm5,mm3 ; mm5=data3=(30 31 32 33)
  240. movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
  241. movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
  242. movq mm2,mm4
  243. movq mm3,mm0
  244. paddd mm4,mm7 ; mm4=data1L
  245. paddd mm0,mm6 ; mm0=data1H
  246. psubd mm2,mm7 ; mm2=data2L
  247. psubd mm3,mm6 ; mm3=data2H
  248. movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; mm7=[PD_DESCALE_P1_4]
  249. paddd mm4,mm7
  250. paddd mm0,mm7
  251. psrad mm4,DESCALE_P1_4
  252. psrad mm0,DESCALE_P1_4
  253. paddd mm2,mm7
  254. paddd mm3,mm7
  255. psrad mm2,DESCALE_P1_4
  256. psrad mm3,DESCALE_P1_4
  257. packssdw mm4,mm0 ; mm4=data1=(10 11 12 13)
  258. packssdw mm2,mm3 ; mm2=data2=(20 21 22 23)
  259. movq mm6,mm1 ; transpose coefficients(phase 1)
  260. punpcklwd mm1,mm4 ; mm1=(00 10 01 11)
  261. punpckhwd mm6,mm4 ; mm6=(02 12 03 13)
  262. movq mm7,mm2 ; transpose coefficients(phase 1)
  263. punpcklwd mm2,mm5 ; mm2=(20 30 21 31)
  264. punpckhwd mm7,mm5 ; mm7=(22 32 23 33)
  265. movq mm0,mm1 ; transpose coefficients(phase 2)
  266. punpckldq mm1,mm2 ; mm1=(00 10 20 30)
  267. punpckhdq mm0,mm2 ; mm0=(01 11 21 31)
  268. movq mm3,mm6 ; transpose coefficients(phase 2)
  269. punpckldq mm6,mm7 ; mm6=(02 12 22 32)
  270. punpckhdq mm3,mm7 ; mm3=(03 13 23 33)
  271. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
  272. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
  273. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
  274. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
  275. .nextcolumn:
  276. add esi, byte 4*SIZEOF_JCOEF ; coef_block
  277. add edx, byte 4*SIZEOF_ISLOW_MULT_TYPE ; quantptr
  278. add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
  279. dec ecx ; ctr
  280. jnz near .columnloop
  281. ; ---- Pass 2: process rows from work array, store into output array.
  282. mov eax, [original_ebp]
  283. lea esi, [workspace] ; JCOEF * wsptr
  284. mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
  285. mov eax, JDIMENSION [output_col(eax)]
  286. ; -- Odd part
  287. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  288. movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  289. movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  290. movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  291. movq mm4,mm0
  292. movq mm5,mm0
  293. punpcklwd mm4,mm1
  294. punpckhwd mm5,mm1
  295. movq mm0,mm4
  296. movq mm1,mm5
  297. pmaddwd mm4,[GOTOFF(ebx,PW_F256_F089)] ; mm4=(tmp2L)
  298. pmaddwd mm5,[GOTOFF(ebx,PW_F256_F089)] ; mm5=(tmp2H)
  299. pmaddwd mm0,[GOTOFF(ebx,PW_F106_MF217)] ; mm0=(tmp0L)
  300. pmaddwd mm1,[GOTOFF(ebx,PW_F106_MF217)] ; mm1=(tmp0H)
  301. movq mm6,mm2
  302. movq mm7,mm2
  303. punpcklwd mm6,mm3
  304. punpckhwd mm7,mm3
  305. movq mm2,mm6
  306. movq mm3,mm7
  307. pmaddwd mm6,[GOTOFF(ebx,PW_MF060_MF050)] ; mm6=(tmp2L)
  308. pmaddwd mm7,[GOTOFF(ebx,PW_MF060_MF050)] ; mm7=(tmp2H)
  309. pmaddwd mm2,[GOTOFF(ebx,PW_F145_MF021)] ; mm2=(tmp0L)
  310. pmaddwd mm3,[GOTOFF(ebx,PW_F145_MF021)] ; mm3=(tmp0H)
  311. paddd mm6,mm4 ; mm6=tmp2L
  312. paddd mm7,mm5 ; mm7=tmp2H
  313. paddd mm2,mm0 ; mm2=tmp0L
  314. paddd mm3,mm1 ; mm3=tmp0H
  315. movq MMWORD [wk(0)], mm2 ; wk(0)=tmp0L
  316. movq MMWORD [wk(1)], mm3 ; wk(1)=tmp0H
  317. ; -- Even part
  318. movq mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  319. movq mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  320. movq mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  321. pxor mm1,mm1
  322. pxor mm2,mm2
  323. punpcklwd mm1,mm4 ; mm1=tmp0L
  324. punpckhwd mm2,mm4 ; mm2=tmp0H
  325. psrad mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
  326. psrad mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
  327. movq mm3,mm5 ; mm5=in2=z2
  328. punpcklwd mm5,mm0 ; mm0=in6=z3
  329. punpckhwd mm3,mm0
  330. pmaddwd mm5,[GOTOFF(ebx,PW_F184_MF076)] ; mm5=tmp2L
  331. pmaddwd mm3,[GOTOFF(ebx,PW_F184_MF076)] ; mm3=tmp2H
  332. movq mm4,mm1
  333. movq mm0,mm2
  334. paddd mm1,mm5 ; mm1=tmp10L
  335. paddd mm2,mm3 ; mm2=tmp10H
  336. psubd mm4,mm5 ; mm4=tmp12L
  337. psubd mm0,mm3 ; mm0=tmp12H
  338. ; -- Final output stage
  339. movq mm5,mm1
  340. movq mm3,mm2
  341. paddd mm1,mm6 ; mm1=data0L
  342. paddd mm2,mm7 ; mm2=data0H
  343. psubd mm5,mm6 ; mm5=data3L
  344. psubd mm3,mm7 ; mm3=data3H
  345. movq mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm6=[PD_DESCALE_P2_4]
  346. paddd mm1,mm6
  347. paddd mm2,mm6
  348. psrad mm1,DESCALE_P2_4
  349. psrad mm2,DESCALE_P2_4
  350. paddd mm5,mm6
  351. paddd mm3,mm6
  352. psrad mm5,DESCALE_P2_4
  353. psrad mm3,DESCALE_P2_4
  354. packssdw mm1,mm2 ; mm1=data0=(00 10 20 30)
  355. packssdw mm5,mm3 ; mm5=data3=(03 13 23 33)
  356. movq mm7, MMWORD [wk(0)] ; mm7=tmp0L
  357. movq mm6, MMWORD [wk(1)] ; mm6=tmp0H
  358. movq mm2,mm4
  359. movq mm3,mm0
  360. paddd mm4,mm7 ; mm4=data1L
  361. paddd mm0,mm6 ; mm0=data1H
  362. psubd mm2,mm7 ; mm2=data2L
  363. psubd mm3,mm6 ; mm3=data2H
  364. movq mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; mm7=[PD_DESCALE_P2_4]
  365. paddd mm4,mm7
  366. paddd mm0,mm7
  367. psrad mm4,DESCALE_P2_4
  368. psrad mm0,DESCALE_P2_4
  369. paddd mm2,mm7
  370. paddd mm3,mm7
  371. psrad mm2,DESCALE_P2_4
  372. psrad mm3,DESCALE_P2_4
  373. packssdw mm4,mm0 ; mm4=data1=(01 11 21 31)
  374. packssdw mm2,mm3 ; mm2=data2=(02 12 22 32)
  375. movq mm6,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm6=[PB_CENTERJSAMP]
  376. packsswb mm1,mm2 ; mm1=(00 10 20 30 02 12 22 32)
  377. packsswb mm4,mm5 ; mm4=(01 11 21 31 03 13 23 33)
  378. paddb mm1,mm6
  379. paddb mm4,mm6
  380. movq mm7,mm1 ; transpose coefficients(phase 1)
  381. punpcklbw mm1,mm4 ; mm1=(00 01 10 11 20 21 30 31)
  382. punpckhbw mm7,mm4 ; mm7=(02 03 12 13 22 23 32 33)
  383. movq mm0,mm1 ; transpose coefficients(phase 2)
  384. punpcklwd mm1,mm7 ; mm1=(00 01 02 03 10 11 12 13)
  385. punpckhwd mm0,mm7 ; mm0=(20 21 22 23 30 31 32 33)
  386. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
  387. mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
  388. movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
  389. movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
  390. psrlq mm1,4*BYTE_BIT
  391. psrlq mm0,4*BYTE_BIT
  392. mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  393. mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
  394. movd DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
  395. movd DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
  396. emms ; empty MMX state
  397. pop edi
  398. pop esi
  399. ; pop edx ; need not be preserved
  400. ; pop ecx ; need not be preserved
  401. poppic ebx
  402. mov esp,ebp ; esp <- aligned ebp
  403. pop esp ; esp <- original ebp
  404. pop ebp
  405. ret
  406. ; --------------------------------------------------------------------------
  407. ;
  408. ; Perform dequantization and inverse DCT on one block of coefficients,
  409. ; producing a reduced-size 2x2 output block.
  410. ;
  411. ; GLOBAL(void)
  412. ; jsimd_idct_2x2_mmx (void * dct_table, JCOEFPTR coef_block,
  413. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  414. ;
  415. %define dct_table(b) (b)+8 ; void * dct_table
  416. %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
  417. %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
  418. %define output_col(b) (b)+20 ; JDIMENSION output_col
  419. align 16
  420. global EXTN(jsimd_idct_2x2_mmx)
  421. EXTN(jsimd_idct_2x2_mmx):
  422. push ebp
  423. mov ebp,esp
  424. push ebx
  425. ; push ecx ; need not be preserved
  426. ; push edx ; need not be preserved
  427. push esi
  428. push edi
  429. get_GOT ebx ; get GOT address
  430. ; ---- Pass 1: process columns from input.
  431. mov edx, POINTER [dct_table(ebp)] ; quantptr
  432. mov esi, JCOEFPTR [coef_block(ebp)] ; inptr
  433. ; | input: | result: |
  434. ; | 00 01 ** 03 ** 05 ** 07 | |
  435. ; | 10 11 ** 13 ** 15 ** 17 | |
  436. ; | ** ** ** ** ** ** ** ** | |
  437. ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
  438. ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
  439. ; | 50 51 ** 53 ** 55 ** 57 | |
  440. ; | ** ** ** ** ** ** ** ** | |
  441. ; | 70 71 ** 73 ** 75 ** 77 | |
  442. ; -- Odd part
  443. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  444. movq mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  445. pmullw mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  446. pmullw mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  447. movq mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  448. movq mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  449. pmullw mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  450. pmullw mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  451. ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
  452. ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
  453. pcmpeqd mm7,mm7
  454. pslld mm7,WORD_BIT ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
  455. movq mm4,mm0 ; mm4=(10 11 ** 13)
  456. movq mm5,mm2 ; mm5=(50 51 ** 53)
  457. punpcklwd mm4,mm1 ; mm4=(10 30 11 31)
  458. punpcklwd mm5,mm3 ; mm5=(50 70 51 71)
  459. pmaddwd mm4,[GOTOFF(ebx,PW_F362_MF127)]
  460. pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
  461. psrld mm0,WORD_BIT ; mm0=(11 -- 13 --)
  462. pand mm1,mm7 ; mm1=(-- 31 -- 33)
  463. psrld mm2,WORD_BIT ; mm2=(51 -- 53 --)
  464. pand mm3,mm7 ; mm3=(-- 71 -- 73)
  465. por mm0,mm1 ; mm0=(11 31 13 33)
  466. por mm2,mm3 ; mm2=(51 71 53 73)
  467. pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
  468. pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
  469. paddd mm4,mm5 ; mm4=tmp0[col0 col1]
  470. movq mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
  471. movq mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
  472. pmullw mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
  473. pmullw mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
  474. movq mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
  475. movq mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
  476. pmullw mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
  477. pmullw mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
  478. ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
  479. ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
  480. psrld mm6,WORD_BIT ; mm6=(15 -- 17 --)
  481. pand mm1,mm7 ; mm1=(-- 35 -- 37)
  482. psrld mm3,WORD_BIT ; mm3=(55 -- 57 --)
  483. pand mm5,mm7 ; mm5=(-- 75 -- 77)
  484. por mm6,mm1 ; mm6=(15 35 17 37)
  485. por mm3,mm5 ; mm3=(55 75 57 77)
  486. pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
  487. pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
  488. paddd mm0,mm2 ; mm0=tmp0[col1 col3]
  489. paddd mm6,mm3 ; mm6=tmp0[col5 col7]
  490. ; -- Even part
  491. movq mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  492. movq mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
  493. pmullw mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
  494. pmullw mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
  495. ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
  496. movq mm2,mm1 ; mm2=(00 01 ** 03)
  497. pslld mm1,WORD_BIT ; mm1=(-- 00 -- **)
  498. psrad mm1,(WORD_BIT-CONST_BITS-2) ; mm1=tmp10[col0 ****]
  499. pand mm2,mm7 ; mm2=(-- 01 -- 03)
  500. pand mm5,mm7 ; mm5=(-- 05 -- 07)
  501. psrad mm2,(WORD_BIT-CONST_BITS-2) ; mm2=tmp10[col1 col3]
  502. psrad mm5,(WORD_BIT-CONST_BITS-2) ; mm5=tmp10[col5 col7]
  503. ; -- Final output stage
  504. movq mm3,mm1
  505. paddd mm1,mm4 ; mm1=data0[col0 ****]=(A0 **)
  506. psubd mm3,mm4 ; mm3=data1[col0 ****]=(B0 **)
  507. punpckldq mm1,mm3 ; mm1=(A0 B0)
  508. movq mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; mm7=[PD_DESCALE_P1_2]
  509. movq mm4,mm2
  510. movq mm3,mm5
  511. paddd mm2,mm0 ; mm2=data0[col1 col3]=(A1 A3)
  512. paddd mm5,mm6 ; mm5=data0[col5 col7]=(A5 A7)
  513. psubd mm4,mm0 ; mm4=data1[col1 col3]=(B1 B3)
  514. psubd mm3,mm6 ; mm3=data1[col5 col7]=(B5 B7)
  515. paddd mm1,mm7
  516. psrad mm1,DESCALE_P1_2
  517. paddd mm2,mm7
  518. paddd mm5,mm7
  519. psrad mm2,DESCALE_P1_2
  520. psrad mm5,DESCALE_P1_2
  521. paddd mm4,mm7
  522. paddd mm3,mm7
  523. psrad mm4,DESCALE_P1_2
  524. psrad mm3,DESCALE_P1_2
  525. ; ---- Pass 2: process rows, store into output array.
  526. mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *)
  527. mov eax, JDIMENSION [output_col(ebp)]
  528. ; | input:| result:|
  529. ; | A0 B0 | |
  530. ; | A1 B1 | C0 C1 |
  531. ; | A3 B3 | D0 D1 |
  532. ; | A5 B5 | |
  533. ; | A7 B7 | |
  534. ; -- Odd part
  535. packssdw mm2,mm4 ; mm2=(A1 A3 B1 B3)
  536. packssdw mm5,mm3 ; mm5=(A5 A7 B5 B7)
  537. pmaddwd mm2,[GOTOFF(ebx,PW_F362_MF127)]
  538. pmaddwd mm5,[GOTOFF(ebx,PW_F085_MF072)]
  539. paddd mm2,mm5 ; mm2=tmp0[row0 row1]
  540. ; -- Even part
  541. pslld mm1,(CONST_BITS+2) ; mm1=tmp10[row0 row1]
  542. ; -- Final output stage
  543. movq mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)] ; mm0=[PD_DESCALE_P2_2]
  544. movq mm6,mm1
  545. paddd mm1,mm2 ; mm1=data0[row0 row1]=(C0 C1)
  546. psubd mm6,mm2 ; mm6=data1[row0 row1]=(D0 D1)
  547. paddd mm1,mm0
  548. paddd mm6,mm0
  549. psrad mm1,DESCALE_P2_2
  550. psrad mm6,DESCALE_P2_2
  551. movq mm7,mm1 ; transpose coefficients
  552. punpckldq mm1,mm6 ; mm1=(C0 D0)
  553. punpckhdq mm7,mm6 ; mm7=(C1 D1)
  554. packssdw mm1,mm7 ; mm1=(C0 D0 C1 D1)
  555. packsswb mm1,mm1 ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
  556. paddb mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
  557. movd ecx,mm1
  558. movd ebx,mm1 ; ebx=(C0 D0 C1 D1)
  559. shr ecx,2*BYTE_BIT ; ecx=(C1 D1 -- --)
  560. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
  561. mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  562. mov WORD [edx+eax*SIZEOF_JSAMPLE], bx
  563. mov WORD [esi+eax*SIZEOF_JSAMPLE], cx
  564. emms ; empty MMX state
  565. pop edi
  566. pop esi
  567. ; pop edx ; need not be preserved
  568. ; pop ecx ; need not be preserved
  569. pop ebx
  570. pop ebp
  571. ret
  572. ; For some reason, the OS X linker does not honor the request to align the
  573. ; segment unless we do this.
  574. align 16