/media/libjpeg/simd/jcsammmx.asm

http://github.com/zpao/v8monkey · Assembly · 324 lines · 204 code · 63 blank · 57 comment · 0 complexity · 68c4533083e89c591bb5ca37917140cd MD5 · raw file

  1. ;
  2. ; jcsammmx.asm - downsampling (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on
  7. ; x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; [TAB8]
  18. %include "jsimdext.inc"
  19. ; --------------------------------------------------------------------------
  20. SECTION SEG_TEXT ; SEG_TEXT is not defined in this file - presumably the code-section name supplied by jsimdext.inc (confirm there)
  21. BITS 32 ; everything below is assembled as 32-bit code
  22. ;
  23. ; Downsample pixel values of a single component.
  24. ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  25. ; without smoothing.
  26. ;
  27. ; GLOBAL(void)
  28. ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
  29. ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
  30. ; JSAMPARRAY input_data, JSAMPARRAY output_data);
  31. ;
  ; What the code below does:
  ;   1. output_cols = width_blocks * 8.  When output_cols * 2 is larger than
  ;      image_width, the last sample of each of the first max_v_samp_factor
  ;      input rows is copied to the right to fill the gap.  This writes into
  ;      the input rows themselves.
  ;   2. For each of v_samp_factor rows, every horizontal pair of input
  ;      samples is replaced by (a + b + bias) >> 1, where bias is 0 for
  ;      even output columns and 1 for odd ones.  Each pass of the column
  ;      loop reads 16 input bytes and stores 8 output bytes.
  ;
  ; All arguments are read from the stack relative to ebp.  esi, edi and ebp
  ; are saved and restored; eax, ecx, edx, the flags and mm0-mm3, mm6, mm7
  ; may be overwritten.  emms is executed before returning on every path
  ; that has touched an MMX register.
  ;
  ; Argument offsets: after "push ebp / mov ebp,esp", [ebp+0] is the saved ebp
  ; and [ebp+4] the return address, so the first argument sits at [ebp+8].
  32. %define img_width(b) (b)+8 ; JDIMENSION image_width
  33. %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
  34. %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
  35. %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
  36. %define input_data(b) (b)+24 ; JSAMPARRAY input_data
  37. %define output_data(b) (b)+28 ; JSAMPARRAY output_data
  38. align 16
  39. global EXTN(jsimd_h2v1_downsample_mmx)
  40. EXTN(jsimd_h2v1_downsample_mmx):
  41. push ebp
  42. mov ebp,esp ; ebp = frame base used by the argument macros above
  43. ; push ebx ; unused
  44. ; push ecx ; need not be preserved
  45. ; push edx ; need not be preserved
  46. push esi ; esi and edi are used as pointers below; restored at .return
  47. push edi
  48. mov ecx, JDIMENSION [width_blks(ebp)]
  49. shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
  50. jz near .return ; output_cols == 0: nothing to do
  51. mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width (used only by the padding step)
  52. ; -- expand_right_edge
  53. push ecx ; keep output_cols; ecx is reused as the padding count
  54. shl ecx,1 ; output_cols * 2
  55. sub ecx,edx ; ecx = output_cols*2 - image_width = bytes to pad per row
  56. jle short .expand_end ; zero or negative (signed compare): no padding
  57. mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
  58. test eax,eax
  59. jle short .expand_end ; no rows to pad
  60. cld ; make stosb move edi upwards
  61. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  62. alignx 16,7 ; alignx is not defined in this file - presumably aligns the loop entry (see jsimdext.inc)
  63. .expandloop:
  64. push eax ; al is about to hold the fill value, so save the row counter
  65. push ecx ; rep stosb counts ecx down to 0, so save the pad count
  66. mov edi, JSAMPROW [esi] ; edi = start of the current input row
  67. add edi,edx ; edi = first byte past column image_width-1
  68. mov al, JSAMPLE [edi-1] ; al = last real sample of the row
  69. rep stosb ; write that sample ecx times starting at edi
  70. pop ecx
  71. pop eax
  72. add esi, byte SIZEOF_JSAMPROW ; step to the next row pointer
  73. dec eax
  74. jg short .expandloop ; repeat while rows remain
  75. .expand_end:
  76. pop ecx ; output_cols
  77. ; -- h2v1_downsample
  78. mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
  79. test eax,eax
  80. jle near .return ; no rows to produce (signed compare)
  81. mov edx, 0x00010000 ; bias pattern
  82. movd mm7,edx ; low dword of mm7 = words {0, 1}
  83. pcmpeqw mm6,mm6 ; mm6 = all bits set
  84. punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
  85. psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
  86. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  87. mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
  88. alignx 16,7
  89. .rowloop:
  90. push ecx ; ecx, edi, esi are reused inside the column loop; saved per row
  91. push edi
  92. push esi
  93. mov esi, JSAMPROW [esi] ; inptr
  94. mov edi, JSAMPROW [edi] ; outptr
  95. alignx 16,7
  96. .columnloop:
  97. movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; first group of input samples
  98. movq mm1, MMWORD [esi+1*SIZEOF_MMWORD] ; second group of input samples
  99. movq mm2,mm0 ; copies, used to extract the odd-numbered samples
  100. movq mm3,mm1
  101. pand mm0,mm6 ; keep the low byte of each word: even-numbered samples
  102. psrlw mm2,BYTE_BIT ; high byte of each word: odd-numbered samples
  103. pand mm1,mm6
  104. psrlw mm3,BYTE_BIT
  105. paddw mm0,mm2 ; word sums of each horizontal sample pair
  106. paddw mm1,mm3
  107. paddw mm0,mm7 ; add the alternating 0/1 bias
  108. paddw mm1,mm7
  109. psrlw mm0,1 ; divide by 2
  110. psrlw mm1,1
  111. packuswb mm0,mm1 ; pack the 4+4 word results into 8 bytes (each is at most 255, so nothing saturates)
  112. movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 ; store one MMWORD of output samples
  113. add esi, byte 2*SIZEOF_MMWORD ; inptr
  114. add edi, byte 1*SIZEOF_MMWORD ; outptr
  115. sub ecx, byte SIZEOF_MMWORD ; outcol
  116. jnz short .columnloop ; ecx is a multiple of 8; assumes SIZEOF_MMWORD is 8 so it reaches exactly 0
  117. pop esi ; back to the row-pointer cursors and output_cols
  118. pop edi
  119. pop ecx
  120. add esi, byte SIZEOF_JSAMPROW ; input_data
  121. add edi, byte SIZEOF_JSAMPROW ; output_data
  122. dec eax ; rowctr
  123. jg short .rowloop
  124. emms ; empty MMX state
  125. .return: ; the early exits above jump here before any MMX instruction has run
  126. pop edi
  127. pop esi
  128. ; pop edx ; need not be preserved
  129. ; pop ecx ; need not be preserved
  130. ; pop ebx ; unused
  131. pop ebp
  132. ret
  133. ; --------------------------------------------------------------------------
  134. ;
  135. ; Downsample pixel values of a single component.
  136. ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
  137. ; without smoothing.
  138. ;
  139. ; GLOBAL(void)
  140. ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
  141. ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
  142. ; JSAMPARRAY input_data, JSAMPARRAY output_data);
  143. ;
  ; What the code below does:
  ;   1. output_cols = width_blocks * 8.  When output_cols * 2 is larger than
  ;      image_width, the last sample of each of the first max_v_samp_factor
  ;      input rows is copied to the right to fill the gap.  This writes into
  ;      the input rows themselves.
  ;   2. For each of v_samp_factor output rows, two consecutive input rows
  ;      are read and every 2x2 group of samples is replaced by
  ;      (a + b + c + d + bias) >> 2, where bias is 1 for even output columns
  ;      and 2 for odd ones.  Each pass of the column loop reads 16 bytes
  ;      from each of the two input rows and stores 8 output bytes.
  ;
  ; All arguments are read from the stack relative to ebp.  esi, edi and ebp
  ; are saved and restored; eax, ecx, edx, the flags and mm0-mm7 may be
  ; overwritten.  emms is executed before returning on every path that has
  ; touched an MMX register.
  ;
  ; Argument offsets: after "push ebp / mov ebp,esp", [ebp+0] is the saved ebp
  ; and [ebp+4] the return address, so the first argument sits at [ebp+8].
  144. %define img_width(b) (b)+8 ; JDIMENSION image_width
  145. %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
  146. %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
  147. %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
  148. %define input_data(b) (b)+24 ; JSAMPARRAY input_data
  149. %define output_data(b) (b)+28 ; JSAMPARRAY output_data
  150. align 16
  151. global EXTN(jsimd_h2v2_downsample_mmx)
  152. EXTN(jsimd_h2v2_downsample_mmx):
  153. push ebp
  154. mov ebp,esp ; ebp = frame base used by the argument macros above
  155. ; push ebx ; unused
  156. ; push ecx ; need not be preserved
  157. ; push edx ; need not be preserved
  158. push esi ; esi and edi are used as pointers below; restored at .return
  159. push edi
  160. mov ecx, JDIMENSION [width_blks(ebp)]
  161. shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
  162. jz near .return ; output_cols == 0: nothing to do
  163. mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width (used only by the padding step)
  164. ; -- expand_right_edge
  165. push ecx ; keep output_cols; ecx is reused as the padding count
  166. shl ecx,1 ; output_cols * 2
  167. sub ecx,edx ; ecx = output_cols*2 - image_width = bytes to pad per row
  168. jle short .expand_end ; zero or negative (signed compare): no padding
  169. mov eax, INT [max_v_samp(ebp)] ; eax = number of rows to pad
  170. test eax,eax
  171. jle short .expand_end ; no rows to pad
  172. cld ; make stosb move edi upwards
  173. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  174. alignx 16,7 ; alignx is not defined in this file - presumably aligns the loop entry (see jsimdext.inc)
  175. .expandloop:
  176. push eax ; al is about to hold the fill value, so save the row counter
  177. push ecx ; rep stosb counts ecx down to 0, so save the pad count
  178. mov edi, JSAMPROW [esi] ; edi = start of the current input row
  179. add edi,edx ; edi = first byte past column image_width-1
  180. mov al, JSAMPLE [edi-1] ; al = last real sample of the row
  181. rep stosb ; write that sample ecx times starting at edi
  182. pop ecx
  183. pop eax
  184. add esi, byte SIZEOF_JSAMPROW ; step to the next row pointer
  185. dec eax
  186. jg short .expandloop ; repeat while rows remain
  187. .expand_end:
  188. pop ecx ; output_cols
  189. ; -- h2v2_downsample
  190. mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
  191. test eax,eax
  192. jle near .return ; no rows to produce (signed compare)
  193. mov edx, 0x00020001 ; bias pattern
  194. movd mm7,edx ; low dword of mm7 = words {1, 2}
  195. pcmpeqw mm6,mm6 ; mm6 = all bits set
  196. punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
  197. psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
  198. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  199. mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
  200. alignx 16,7
  201. .rowloop:
  202. push ecx ; ecx, edi, esi are reused inside the column loop; saved per row
  203. push edi
  204. push esi
  205. mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
  206. mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
  207. mov edi, JSAMPROW [edi] ; outptr
  208. alignx 16,7
  209. .columnloop:
  210. movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] ; upper row, first group of samples
  211. movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] ; lower row, first group of samples
  212. movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] ; upper row, second group of samples
  213. movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] ; lower row, second group of samples
  214. movq mm4,mm0 ; copies, used to extract the odd-numbered samples
  215. movq mm5,mm1
  216. pand mm0,mm6 ; keep the low byte of each word: even-numbered samples
  217. psrlw mm4,BYTE_BIT ; high byte of each word: odd-numbered samples
  218. pand mm1,mm6
  219. psrlw mm5,BYTE_BIT
  220. paddw mm0,mm4 ; horizontal pair sums, upper row, first group
  221. paddw mm1,mm5 ; horizontal pair sums, lower row, first group
  222. movq mm4,mm2 ; same even/odd split for the second group
  223. movq mm5,mm3
  224. pand mm2,mm6
  225. psrlw mm4,BYTE_BIT
  226. pand mm3,mm6
  227. psrlw mm5,BYTE_BIT
  228. paddw mm2,mm4 ; horizontal pair sums, upper row, second group
  229. paddw mm3,mm5 ; horizontal pair sums, lower row, second group
  230. paddw mm0,mm1 ; add the two rows: 2x2 sums, first group
  231. paddw mm2,mm3 ; add the two rows: 2x2 sums, second group
  232. paddw mm0,mm7 ; add the alternating 1/2 bias
  233. paddw mm2,mm7
  234. psrlw mm0,2 ; divide by 4
  235. psrlw mm2,2
  236. packuswb mm0,mm2 ; pack the 4+4 word results into 8 bytes (each is at most 255, so nothing saturates)
  237. movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 ; store one MMWORD of output samples
  238. add edx, byte 2*SIZEOF_MMWORD ; inptr0
  239. add esi, byte 2*SIZEOF_MMWORD ; inptr1
  240. add edi, byte 1*SIZEOF_MMWORD ; outptr
  241. sub ecx, byte SIZEOF_MMWORD ; outcol
  242. jnz near .columnloop ; ecx is a multiple of 8; assumes SIZEOF_MMWORD is 8 so it reaches exactly 0
  243. pop esi ; back to the row-pointer cursors and output_cols
  244. pop edi
  245. pop ecx
  246. add esi, byte 2*SIZEOF_JSAMPROW ; input_data
  247. add edi, byte 1*SIZEOF_JSAMPROW ; output_data
  248. dec eax ; rowctr
  249. jg near .rowloop
  250. emms ; empty MMX state
  251. .return: ; the early exits above jump here before any MMX instruction has run
  252. pop edi
  253. pop esi
  254. ; pop edx ; need not be preserved
  255. ; pop ecx ; need not be preserved
  256. ; pop ebx ; unused
  257. pop ebp
  258. ret
  259. ; For some reason, the OS X linker does not honor the request to align the
  260. ; segment unless we do this.
  261. align 16 ; pad the end of this section's contents to a 16-byte boundary