/media/libjpeg/simd/jcqnt3dn.asm

http://github.com/zpao/v8monkey · Assembly · 233 lines · 150 code · 38 blank · 45 comment · 0 complexity · b347edb8482b01142d810d62fbe3a0ee MD5 · raw file

  1. ;
  2. ; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ;
  6. ; Based on
  7. ; x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; [TAB8]
  18. %include "jsimdext.inc"
  19. %include "jdct.inc"
  20. ; --------------------------------------------------------------------------
  21. SECTION SEG_TEXT
  22. BITS 32
  ;
  ; jsimd_convsamp_float_3dnow
  ;
  ; Load sample data into the float workspace, applying the
  ; unsigned->signed conversion on the way.
  ;
  ; GLOBAL(void)
  ; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                             FAST_FLOAT * workspace);
  ;
  ; In:     three stack arguments, addressed through ebp (%defines below)
  ; Out:    no return value; floats are stored through `workspace`
  ; Saved:  ebp, ebx, esi, edi
  ; Used:   eax, ecx, edx, mm0-mm7, flags (not restored)
  ;
  ; Each loop pass handles two sample rows; the loop runs DCTSIZE/2 times
  ; and the workspace pointer advances by 2*DCTSIZE floats per pass.
  ;
  ; Lane notation in the comments: hex digits 0-7 are the samples of the
  ; first row of the pair, 8-F those of the second row; '*' marks a lane
  ; whose contents are stale and get discarded later.
  ;
  %define sample_data     ebp+8           ; JSAMPARRAY sample_data
  %define start_col       ebp+12          ; JDIMENSION start_col
  %define workspace       ebp+16          ; FAST_FLOAT * workspace

  ; Arithmetic right shift that brings a byte sitting in the top of a
  ; dword down to the bottom, sign-extending it.
  ; NOTE(review): DWORD_BIT and BYTE_BIT come from the include files and
  ; are not visible here -- presumably 32 and 8, giving a shift of 24.
  %define CONVSAMP_SEXT_SHIFT     (DWORD_BIT-BYTE_BIT)

          align   16
          global  EXTN(jsimd_convsamp_float_3dnow)

  EXTN(jsimd_convsamp_float_3dnow):
          push    ebp
          mov     ebp, esp
          push    ebx
          ; ecx and edx are used below but need not be preserved
          push    esi
          push    edi

          ; Build the per-byte bias without touching memory:
          ; 0xFFFF -> 0xFF80 per word -> saturating pack -> 0x80 per byte.
          pcmpeqw  mm7, mm7
          psllw    mm7, 7
          packsswb mm7, mm7               ; mm7 = PB_CENTERJSAMPLE (0x808080..)

          mov     esi, JSAMPARRAY [sample_data]   ; esi -> current row pointer pair
          mov     eax, JDIMENSION [start_col]     ; eax = column offset into each row
          mov     edi, POINTER [workspace]        ; edi -> output (FAST_FLOAT *)
          mov     ecx, DCTSIZE/2                  ; ecx = row pairs still to convert
          alignx  16, 7

  .row_pair:
          mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; first row  (JSAMPLE *)
          mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; second row (JSAMPLE *)

          movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
          movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

          psubb   mm0, mm7                ; mm0=(01234567), bias removed
          psubb   mm1, mm7                ; mm1=(89ABCDEF), bias removed

          ; Widen byte -> word -> dword by unpacking each sample into the
          ; HIGH part of the wider lane; the low part keeps stale register
          ; contents, which the arithmetic shift below throws away.
          punpcklbw mm2, mm0              ; mm2=(*0*1*2*3)
          punpckhbw mm0, mm0              ; mm0=(*4*5*6*7)
          punpcklbw mm3, mm1              ; mm3=(*8*9*A*B)
          punpckhbw mm1, mm1              ; mm1=(*C*D*E*F)

          ; ---- first row of the pair ----
          punpcklwd mm4, mm2              ; mm4=(***0***1)
          punpckhwd mm2, mm2              ; mm2=(***2***3)
          punpcklwd mm5, mm0              ; mm5=(***4***5)
          punpckhwd mm0, mm0              ; mm0=(***6***7)

          psrad   mm4, CONVSAMP_SEXT_SHIFT        ; mm4=(01) as signed dwords
          psrad   mm2, CONVSAMP_SEXT_SHIFT        ; mm2=(23)
          pi2fd   mm4, mm4                        ; packed int32 -> packed float
          pi2fd   mm2, mm2
          psrad   mm5, CONVSAMP_SEXT_SHIFT        ; mm5=(45)
          psrad   mm0, CONVSAMP_SEXT_SHIFT        ; mm0=(67)
          pi2fd   mm5, mm5
          pi2fd   mm0, mm0

          movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
          movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
          movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
          movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

          ; ---- second row of the pair ----
          punpcklwd mm6, mm3              ; mm6=(***8***9)
          punpckhwd mm3, mm3              ; mm3=(***A***B)
          punpcklwd mm4, mm1              ; mm4=(***C***D)
          punpckhwd mm1, mm1              ; mm1=(***E***F)

          psrad   mm6, CONVSAMP_SEXT_SHIFT        ; mm6=(89)
          psrad   mm3, CONVSAMP_SEXT_SHIFT        ; mm3=(AB)
          pi2fd   mm6, mm6
          pi2fd   mm3, mm3
          psrad   mm4, CONVSAMP_SEXT_SHIFT        ; mm4=(CD)
          psrad   mm1, CONVSAMP_SEXT_SHIFT        ; mm1=(EF)
          pi2fd   mm4, mm4
          pi2fd   mm1, mm1

          movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
          movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
          movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
          movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

          add     esi, byte 2*SIZEOF_JSAMPROW             ; next two row pointers
          add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two output rows
          dec     ecx
          jnz     near .row_pair

          femms                           ; empty MMX/3DNow! state

          pop     edi
          pop     esi
          ; edx and ecx were never saved, so there is nothing to restore
          pop     ebx
          pop     ebp
          ret
  106. ; --------------------------------------------------------------------------
  ;
  ; jsimd_quantize_float_3dnow
  ;
  ; Quantize/descale the coefficients, and store into coef_block.
  ;
  ; GLOBAL(void)
  ; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
  ;                             FAST_FLOAT * workspace);
  ;
  ; In:     three stack arguments, addressed through ebp (%defines below)
  ; Out:    no return value; 16-bit words are stored through `coef_block`
  ; Saved:  ebp, esi, edi
  ; Used:   eax, edx, mm0-mm7, flags (not restored); ebx and ecx untouched
  ;
  ; Each loop pass consumes 16 floats from `workspace` and from `divisors`
  ; and emits 16 coefficients; the loop runs DCTSIZE2/16 times.
  ; NOTE(review): workspace values are MULTIPLIED by the `divisors`
  ; entries, so that table presumably holds reciprocals -- confirm
  ; against the caller.
  ;
  ; Lane notation in the comments: "rc" is the coefficient at row r,
  ; column c of the current two-row group; '**' is a 16-bit lane whose
  ; contents are not needed.
  ;
  %define coef_block      ebp+8           ; JCOEFPTR coef_block
  %define divisors        ebp+12          ; FAST_FLOAT * divisors
  %define workspace       ebp+16          ; FAST_FLOAT * workspace

  ; Bit pattern of the single-precision value 12582912.0 (0x00C00000 as a
  ; float).  Once it has been added to a product, the low 16 bits of the
  ; sum's bit pattern are what gets stored as the integer coefficient.
  %define QUANT_RNDINT_MAGIC      0x4B400000

          align   16
          global  EXTN(jsimd_quantize_float_3dnow)

  EXTN(jsimd_quantize_float_3dnow):
          push    ebp
          mov     ebp, esp
          ; ebx and ecx are unused; edx need not be preserved
          push    esi
          push    edi

          mov     eax, QUANT_RNDINT_MAGIC ; (float)0x00C00000 (rndint_magic)
          movd    mm7, eax
          punpckldq mm7, mm7              ; mm7={12582912.0F 12582912.0F}

          mov     esi, POINTER [workspace]        ; esi -> input floats
          mov     edx, POINTER [divisors]         ; edx -> per-coefficient multipliers
          mov     edi, JCOEFPTR [coef_block]      ; edi -> output coefficients
          mov     eax, DCTSIZE2/16                ; eax = 16-coefficient groups left
          alignx  16, 7

  .next16:
          ; ---- row 0 of the group: scale, then add the magic constant ----
          movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
          movq    mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
          pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
          pfmul   mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
          movq    mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
          movq    mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
          pfmul   mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
          pfmul   mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]

          pfadd   mm0, mm7                ; mm0=(00 ** 01 **)
          pfadd   mm1, mm7                ; mm1=(02 ** 03 **)
          pfadd   mm2, mm7                ; mm2=(04 ** 05 **)
          pfadd   mm3, mm7                ; mm3=(06 ** 07 **)

          ; Gather the low word of every dword into four consecutive words.
          movq      mm4, mm0
          punpcklwd mm0, mm1              ; mm0=(00 02 ** **)
          punpckhwd mm4, mm1              ; mm4=(01 03 ** **)
          movq      mm5, mm2
          punpcklwd mm2, mm3              ; mm2=(04 06 ** **)
          punpckhwd mm5, mm3              ; mm5=(05 07 ** **)

          punpcklwd mm0, mm4              ; mm0=(00 01 02 03)
          punpcklwd mm2, mm5              ; mm2=(04 05 06 07)

          ; ---- row 1 of the group: same steps on a different register set ----
          movq    mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
          movq    mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
          pfmul   mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
          pfmul   mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
          movq    mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
          movq    mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
          pfmul   mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
          pfmul   mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

          pfadd   mm6, mm7                ; mm6=(10 ** 11 **)
          pfadd   mm1, mm7                ; mm1=(12 ** 13 **)
          pfadd   mm3, mm7                ; mm3=(14 ** 15 **)
          pfadd   mm4, mm7                ; mm4=(16 ** 17 **)

          movq      mm5, mm6
          punpcklwd mm6, mm1              ; mm6=(10 12 ** **)
          punpckhwd mm5, mm1              ; mm5=(11 13 ** **)
          movq      mm1, mm3
          punpcklwd mm3, mm4              ; mm3=(14 16 ** **)
          punpckhwd mm1, mm4              ; mm1=(15 17 ** **)

          punpcklwd mm6, mm5              ; mm6=(10 11 12 13)
          punpcklwd mm3, mm1              ; mm3=(14 15 16 17)

          ; ---- store the 16 packed coefficients ----
          movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
          movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
          movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
          movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

          add     esi, byte 16*SIZEOF_FAST_FLOAT  ; advance input by 16 floats
          add     edx, byte 16*SIZEOF_FAST_FLOAT  ; advance multipliers likewise
          add     edi, byte 16*SIZEOF_JCOEF       ; advance output by 16 coefficients
          dec     eax
          jnz     near .next16

          femms                           ; empty MMX/3DNow! state

          pop     edi
          pop     esi
          ; edx, ecx and ebx were never saved, so there is nothing to restore
          pop     ebp
          ret
  193. ; For some reason, the OS X linker does not honor the request to align the
  194. ; segment unless we do this.
  195. align 16