/media/libjpeg/simd/jdmrgss2-64.asm

http://github.com/zpao/v8monkey · Assembly · 584 lines · 434 code · 81 blank · 69 comment · 1 complexity · ee244d1525fbc712ce9dde3b92e31db2 MD5 · raw file

  1. ;
  2. ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright 2009 D. R. Commander
  6. ;
  7. ; Based on
  8. ; x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler),
  13. ; can *not* be assembled with Microsoft's MASM or any compatible
  14. ; assembler (including Borland's Turbo Assembler).
  15. ; NASM is available from http://nasm.sourceforge.net/ or
  16. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17. ;
  18. ; [TAB8]
  19. %include "jcolsamp.inc"
  20. ; --------------------------------------------------------------------------
  21. SECTION SEG_TEXT ; code section; SEG_TEXT is a macro defined outside this file
  22. BITS 64 ; everything below is assembled as 64-bit code
  23. ;
  24. ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
  25. ; Each Cb/Cr sample is applied to one even and one odd Y sample; YCbCr to RGB conversion happens in the same pass.
  26. ; GLOBAL(void)
  27. ; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
  28. ; JSAMPIMAGE input_buf,
  29. ; JDIMENSION in_row_group_ctr,
  30. ; JSAMPARRAY output_buf);
  31. ; The register assignments below are what the code expects after collect_args (a macro defined outside this file; confirm there).
  32. ; r10 = JDIMENSION output_width
  33. ; r11 = JSAMPIMAGE input_buf
  34. ; r12 = JDIMENSION in_row_group_ctr
  35. ; r13 = JSAMPARRAY output_buf
  36. %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM], addressed downward from the aligned rbp
  37. %define WK_NUM 3 ; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
  38. align 16
  39. global EXTN(jsimd_h2v1_merged_upsample_sse2)
  40. EXTN(jsimd_h2v1_merged_upsample_sse2):
  41. push rbp ; preserve the caller's rbp
  42. mov rax,rsp ; rax = rsp after the push (address of the saved rbp)
  43. sub rsp, byte 4 ; step below the saved rbp before rounding down; NOTE(review): 4 bytes are reserved but 8 are stored at [rsp] below - appears safe while rsp is 8-byte aligned here, confirm
  44. and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
  45. mov [rsp],rax ; remember the pre-alignment rsp so the epilogue can restore it
  46. mov rbp,rsp ; rbp = aligned rbp
  47. lea rsp, [wk(0)] ; reserve the WK_NUM xmmword slots below rbp
  48. collect_args ; macro defined outside this file
  49. push rbx ; rbx is used as inptr1 below; popped again at .return
  50. mov rcx, r10 ; col
  51. test rcx,rcx
  52. jz near .return ; zero output width: nothing to convert
  53. push rcx ; park col while rcx serves as the row index
  54. mov rdi, r11 ; rdi = input_buf
  55. mov rcx, r12 ; rcx = in_row_group_ctr
  56. mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; rsi = input_buf[0]
  57. mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; rbx = input_buf[1]
  58. mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; rdx = input_buf[2]
  59. mov rdi, r13 ; rdi = output_buf
  60. mov rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW] ; inptr0 = input_buf[0][in_row_group_ctr]
  61. mov rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW] ; inptr1 = input_buf[1][in_row_group_ctr]
  62. mov rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW] ; inptr2 = input_buf[2][in_row_group_ctr]
  63. mov rdi, JSAMPROW [rdi] ; outptr = output_buf[0]
  64. pop rcx ; col
  65. .columnloop: ; each pass consumes one xmmword of Cb and of Cr and up to two xmmwords of Y
  66. movdqa xmm6, XMMWORD [rbx] ; xmm6=Cb(0123456789ABCDEF)
  67. movdqa xmm7, XMMWORD [rdx] ; xmm7=Cr(0123456789ABCDEF)
  68. pxor xmm1,xmm1 ; xmm1=(all 0's)
  69. pcmpeqw xmm3,xmm3 ; xmm3=(all 1's)
  70. psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
  71. movdqa xmm4,xmm6
  72. punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH
  73. punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL
  74. movdqa xmm0,xmm7
  75. punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH
  76. punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL
  77. paddw xmm6,xmm3 ; adding 0xFF80 (-128 as a signed word) re-centers each chroma sample around zero
  78. paddw xmm4,xmm3
  79. paddw xmm7,xmm3
  80. paddw xmm0,xmm3
  81. ; (Original)
  82. ; R = Y + 1.40200 * Cr
  83. ; G = Y - 0.34414 * Cb - 0.71414 * Cr
  84. ; B = Y + 1.77200 * Cb
  85. ;
  86. ; (This implementation)
  87. ; R = Y + 0.40200 * Cr + Cr
  88. ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
  89. ; B = Y - 0.22800 * Cb + Cb + Cb
  90. movdqa xmm5,xmm6 ; xmm5=CbH
  91. movdqa xmm2,xmm4 ; xmm2=CbL
  92. paddw xmm6,xmm6 ; xmm6=2*CbH
  93. paddw xmm4,xmm4 ; xmm4=2*CbL
  94. movdqa xmm1,xmm7 ; xmm1=CrH
  95. movdqa xmm3,xmm0 ; xmm3=CrL
  96. paddw xmm7,xmm7 ; xmm7=2*CrH
  97. paddw xmm0,xmm0 ; xmm0=2*CrL
  98. pmulhw xmm6,[rel PW_MF0228] ; xmm6=(2*CbH * -FIX(0.22800))
  99. pmulhw xmm4,[rel PW_MF0228] ; xmm4=(2*CbL * -FIX(0.22800))
  100. pmulhw xmm7,[rel PW_F0402] ; xmm7=(2*CrH * FIX(0.40200))
  101. pmulhw xmm0,[rel PW_F0402] ; xmm0=(2*CrL * FIX(0.40200))
  102. paddw xmm6,[rel PW_ONE] ; add PW_ONE, then the psraw by 1 below halves the doubled product
  103. paddw xmm4,[rel PW_ONE]
  104. psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800))
  105. psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800))
  106. paddw xmm7,[rel PW_ONE]
  107. paddw xmm0,[rel PW_ONE]
  108. psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200))
  109. psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200))
  110. paddw xmm6,xmm5 ; first of the two "+ Cb" terms
  111. paddw xmm4,xmm2
  112. paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
  113. paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
  114. paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
  115. paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
  116. movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H
  117. movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H
  118. movdqa xmm6,xmm5 ; xmm6=CbH (copy for the upper-word interleave)
  119. movdqa xmm7,xmm2 ; xmm7=CbL (copy for the upper-word interleave)
  120. punpcklwd xmm5,xmm1 ; interleave CbH/CrH words so pmaddwd combines each Cb,Cr pair into one dword
  121. punpckhwd xmm6,xmm1
  122. pmaddwd xmm5,[rel PW_MF0344_F0285]
  123. pmaddwd xmm6,[rel PW_MF0344_F0285]
  124. punpcklwd xmm2,xmm3 ; same interleave for CbL/CrL
  125. punpckhwd xmm7,xmm3
  126. pmaddwd xmm2,[rel PW_MF0344_F0285]
  127. pmaddwd xmm7,[rel PW_MF0344_F0285]
  128. paddd xmm5,[rel PD_ONEHALF] ; add PD_ONEHALF before the arithmetic shift right by SCALEBITS
  129. paddd xmm6,[rel PD_ONEHALF]
  130. psrad xmm5,SCALEBITS
  131. psrad xmm6,SCALEBITS
  132. paddd xmm2,[rel PD_ONEHALF]
  133. paddd xmm7,[rel PD_ONEHALF]
  134. psrad xmm2,SCALEBITS
  135. psrad xmm7,SCALEBITS
  136. packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
  137. packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
  138. psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
  139. psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
  140. movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H
  141. mov al,2 ; Yctr: two Y passes per chroma block (L halves first, then H halves)
  142. jmp short .Yloop_1st ; first pass uses the L-half values still held in xmm0/xmm2/xmm4
  143. .Yloop_2nd: ; second pass: reload the H-half differences saved in wk()
  144. movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H
  145. movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H
  146. movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H
  147. .Yloop_1st: ; on entry xmm0/xmm2/xmm4 = (R-Y)/(G-Y)/(B-Y) for this pass
  148. movdqa xmm7, XMMWORD [rsi] ; xmm7=Y(0123456789ABCDEF)
  149. pcmpeqw xmm6,xmm6
  150. psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
  151. pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE
  152. psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO
  153. movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H)
  154. movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H)
  155. movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H)
  156. paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
  157. paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
  158. packuswb xmm0,xmm0 ; xmm0=R(02468ACE********)
  159. packuswb xmm1,xmm1 ; xmm1=R(13579BDF********)
  160. paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
  161. paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
  162. packuswb xmm2,xmm2 ; xmm2=G(02468ACE********)
  163. packuswb xmm3,xmm3 ; xmm3=G(13579BDF********)
  164. paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
  165. paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
  166. packuswb xmm4,xmm4 ; xmm4=B(02468ACE********)
  167. packuswb xmm5,xmm5 ; xmm5=B(13579BDF********)
  168. %if RGB_PIXELSIZE == 3 ; ---------------
  169. ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
  170. ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
  171. ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
  172. ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
  173. punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
  174. punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
  175. punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
  176. movdqa xmmG,xmmA
  177. movdqa xmmH,xmmA
  178. punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
  179. punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
  180. psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
  181. psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
  182. movdqa xmmC,xmmD
  183. movdqa xmmB,xmmD
  184. punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
  185. punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
  186. psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
  187. movdqa xmmF,xmmE
  188. punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
  189. punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
  190. pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
  191. movdqa xmmB,xmmE
  192. punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
  193. punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
  194. punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
  195. pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
  196. movdqa xmmB,xmmF
  197. punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
  198. punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
  199. punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
  200. punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
  201. punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
  202. punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
  203. cmp rcx, byte SIZEOF_XMMWORD
  204. jb short .column_st32 ; fewer than SIZEOF_XMMWORD columns remain: partial store
  205. test rdi, SIZEOF_XMMWORD-1
  206. jnz short .out1 ; outptr is not xmmword-aligned
  207. ; --(aligned)-------------------
  208. movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
  209. movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
  210. movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
  211. add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
  212. jmp short .out0
  213. .out1: ; --(unaligned)-----------------
  214. pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
  215. maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
  216. add rdi, byte SIZEOF_XMMWORD ; outptr
  217. maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
  218. add rdi, byte SIZEOF_XMMWORD ; outptr
  219. maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
  220. add rdi, byte SIZEOF_XMMWORD ; outptr
  221. .out0:
  222. sub rcx, byte SIZEOF_XMMWORD ; col -= columns just written
  223. jz near .endcolumn ; row finished
  224. add rsi, byte SIZEOF_XMMWORD ; inptr0
  225. dec al ; Yctr
  226. jnz near .Yloop_2nd ; second Y xmmword still shares this chroma block
  227. add rbx, byte SIZEOF_XMMWORD ; inptr1
  228. add rdx, byte SIZEOF_XMMWORD ; inptr2
  229. jmp near .columnloop
  230. .column_st32: ; tail: store only the remaining columns
  231. pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
  232. lea rcx, [rcx+rcx*2] ; rcx = remaining columns * 3 = remaining bytes (imul ecx, RGB_PIXELSIZE)
  233. cmp rcx, byte 2*SIZEOF_XMMWORD
  234. jb short .column_st16 ; fewer than two full xmmwords left
  235. maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
  236. add rdi, byte SIZEOF_XMMWORD ; outptr
  237. maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
  238. add rdi, byte SIZEOF_XMMWORD ; outptr
  239. movdqa xmmA,xmmF ; remaining bytes come from the third xmmword
  240. sub rcx, byte 2*SIZEOF_XMMWORD
  241. jmp short .column_st15
  242. .column_st16:
  243. cmp rcx, byte SIZEOF_XMMWORD
  244. jb short .column_st15 ; less than one full xmmword left
  245. maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
  246. add rdi, byte SIZEOF_XMMWORD ; outptr
  247. movdqa xmmA,xmmD ; remaining bytes come from the second xmmword
  248. sub rcx, byte SIZEOF_XMMWORD
  249. .column_st15:
  250. mov rax,rcx ; rax = bytes still to store (less than one xmmword)
  251. xor rcx, byte 0x0F ; rcx = 15 - rax
  252. shl rcx, 2 ; rcx = 4*(15 - rax), used as a bit-shift count
  253. movd xmmB,ecx
  254. psrlq xmmH,4
  255. pcmpeqb xmmE,xmmE
  256. psrlq xmmH,xmmB
  257. psrlq xmmE,xmmB
  258. punpcklbw xmmE,xmmH ; xmmE = byte mask selecting the first rax bytes for maskmovdqu
  259. ; ----------------
  260. mov rcx,rdi
  261. and rcx, byte SIZEOF_XMMWORD-1 ; rcx = misalignment of outptr
  262. jz short .adj0
  263. add rax,rcx
  264. cmp rax, byte SIZEOF_XMMWORD
  265. ja short .adj0 ; tail would extend past the aligned xmmword: store from the unaligned address
  266. and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
  267. shl rcx, 3 ; byte offset to bit count; the code below emulates pslldq xmmA,ecx & pslldq xmmE,ecx
  268. movdqa xmmG,xmmA
  269. movdqa xmmC,xmmE
  270. pslldq xmmA, SIZEOF_XMMWORD/2
  271. pslldq xmmE, SIZEOF_XMMWORD/2
  272. movd xmmD,ecx
  273. sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
  274. jb short .adj1 ; shift is smaller than half an xmmword
  275. movd xmmF,ecx
  276. psllq xmmA,xmmF
  277. psllq xmmE,xmmF
  278. jmp short .adj0
  279. .adj1: neg rcx
  280. movd xmmF,ecx
  281. psrlq xmmA,xmmF ; bits carried from the low half into the high half
  282. psrlq xmmE,xmmF
  283. psllq xmmG,xmmD
  284. psllq xmmC,xmmD
  285. por xmmA,xmmG ; xmmA = data shifted left by the misalignment
  286. por xmmE,xmmC ; xmmE = mask shifted left by the misalignment
  287. .adj0: ; ----------------
  288. maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
  289. %else ; RGB_PIXELSIZE == 4 ; -----------
  290. %ifdef RGBX_FILLER_0XFF
  291. pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
  292. pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
  293. %else
  294. pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********)
  295. pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********)
  296. %endif
  297. ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
  298. ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
  299. ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
  300. ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
  301. punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
  302. punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
  303. punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
  304. punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
  305. movdqa xmmC,xmmA
  306. punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
  307. punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
  308. movdqa xmmG,xmmB
  309. punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
  310. punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
  311. movdqa xmmD,xmmA
  312. punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
  313. punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
  314. movdqa xmmH,xmmC
  315. punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
  316. punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
  317. cmp rcx, byte SIZEOF_XMMWORD
  318. jb short .column_st32 ; fewer than SIZEOF_XMMWORD columns remain: partial store
  319. test rdi, SIZEOF_XMMWORD-1
  320. jnz short .out1 ; outptr is not xmmword-aligned
  321. ; --(aligned)-------------------
  322. movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
  323. movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
  324. movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
  325. movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
  326. add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
  327. jmp short .out0
  328. .out1: ; --(unaligned)-----------------
  329. pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
  330. maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
  331. add rdi, byte SIZEOF_XMMWORD ; outptr
  332. maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
  333. add rdi, byte SIZEOF_XMMWORD ; outptr
  334. maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
  335. add rdi, byte SIZEOF_XMMWORD ; outptr
  336. maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
  337. add rdi, byte SIZEOF_XMMWORD ; outptr
  338. .out0:
  339. sub rcx, byte SIZEOF_XMMWORD ; col -= columns just written
  340. jz near .endcolumn ; row finished
  341. add rsi, byte SIZEOF_XMMWORD ; inptr0
  342. dec al ; Yctr
  343. jnz near .Yloop_2nd ; second Y xmmword still shares this chroma block
  344. add rbx, byte SIZEOF_XMMWORD ; inptr1
  345. add rdx, byte SIZEOF_XMMWORD ; inptr2
  346. jmp near .columnloop
  347. .column_st32: ; tail: rcx stays a column count in this branch
  348. pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
  349. cmp rcx, byte SIZEOF_XMMWORD/2 ; enough columns left to fill both xmmA and xmmD?
  350. jb short .column_st16
  351. maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
  352. add rdi, byte SIZEOF_XMMWORD ; outptr
  353. maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
  354. add rdi, byte SIZEOF_XMMWORD ; outptr
  355. movdqa xmmA,xmmC ; continue with the third and fourth xmmwords
  356. movdqa xmmD,xmmH
  357. sub rcx, byte SIZEOF_XMMWORD/2
  358. .column_st16:
  359. cmp rcx, byte SIZEOF_XMMWORD/4 ; enough columns left to fill xmmA?
  360. jb short .column_st15
  361. maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
  362. add rdi, byte SIZEOF_XMMWORD ; outptr
  363. movdqa xmmA,xmmD
  364. sub rcx, byte SIZEOF_XMMWORD/4
  365. .column_st15:
  366. cmp rcx, byte SIZEOF_XMMWORD/16
  367. jb near .endcolumn ; no columns left to store
  368. mov rax,rcx ; rax = remaining columns
  369. xor rcx, byte 0x03 ; rcx = 3 - rax
  370. inc rcx ; rcx = 4 - rax
  371. shl rcx, 4 ; rcx = 16*(4 - rax), used as a bit-shift count
  372. movd xmmF,ecx
  373. psrlq xmmE,xmmF
  374. punpcklbw xmmE,xmmE ; xmmE = byte mask selecting the first rax pixels for maskmovdqu
  375. ; ----------------
  376. mov rcx,rdi
  377. and rcx, byte SIZEOF_XMMWORD-1 ; rcx = misalignment of outptr
  378. jz short .adj0
  379. lea rax, [rcx+rax*4] ; rax = misalignment + remaining bytes (columns * RGB_PIXELSIZE)
  380. cmp rax, byte SIZEOF_XMMWORD
  381. ja short .adj0 ; tail would extend past the aligned xmmword: store from the unaligned address
  382. and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
  383. shl rcx, 3 ; byte offset to bit count; the code below emulates pslldq xmmA,ecx & pslldq xmmE,ecx
  384. movdqa xmmB,xmmA
  385. movdqa xmmG,xmmE
  386. pslldq xmmA, SIZEOF_XMMWORD/2
  387. pslldq xmmE, SIZEOF_XMMWORD/2
  388. movd xmmC,ecx
  389. sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
  390. jb short .adj1 ; shift is smaller than half an xmmword
  391. movd xmmH,ecx
  392. psllq xmmA,xmmH
  393. psllq xmmE,xmmH
  394. jmp short .adj0
  395. .adj1: neg rcx
  396. movd xmmH,ecx
  397. psrlq xmmA,xmmH ; bits carried from the low half into the high half
  398. psrlq xmmE,xmmH
  399. psllq xmmB,xmmC
  400. psllq xmmG,xmmC
  401. por xmmA,xmmB ; xmmA = data shifted left by the misalignment
  402. por xmmE,xmmG ; xmmE = mask shifted left by the misalignment
  403. .adj0: ; ----------------
  404. maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
  405. %endif ; RGB_PIXELSIZE ; ---------------
  406. .endcolumn:
  407. sfence ; flush the write buffer
  408. .return:
  409. pop rbx ; matches the push rbx after collect_args
  410. uncollect_args ; macro defined outside this file
  411. mov rsp,rbp ; rsp <- aligned rbp
  412. pop rsp ; rsp <- value stored at [rbp] by the prologue (the pre-alignment rsp)
  413. pop rbp ; restore the caller's rbp
  414. ret
  415. ; --------------------------------------------------------------------------
  416. ;
  417. ; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
  418. ; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per output row, sharing the same chroma rows.
  419. ; GLOBAL(void)
  420. ; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
  421. ; JSAMPIMAGE input_buf,
  422. ; JDIMENSION in_row_group_ctr,
  423. ; JSAMPARRAY output_buf);
  424. ; The register assignments below are what the code expects after collect_args (a macro defined outside this file; confirm there).
  425. ; r10 = JDIMENSION output_width
  426. ; r11 = JSAMPIMAGE input_buf
  427. ; r12 = JDIMENSION in_row_group_ctr
  428. ; r13 = JSAMPARRAY output_buf
  429. align 16
  430. global EXTN(jsimd_h2v2_merged_upsample_sse2)
  431. EXTN(jsimd_h2v2_merged_upsample_sse2):
  432. push rbp ; preserve the caller's rbp
  433. mov rax,rsp ; NOTE(review): rax is overwritten right after collect_args; presumably the macro reads it - confirm
  434. mov rbp,rsp
  435. collect_args ; macro defined outside this file
  436. push rbx ; popped again just before uncollect_args
  437. mov rax, r10 ; rax = output_width
  438. mov rdi, r11 ; rdi = input_buf
  439. mov rcx, r12 ; rcx = in_row_group_ctr
  440. mov rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY] ; rsi = input_buf[0]
  441. mov rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY] ; rbx = input_buf[1]
  442. mov rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY] ; rdx = input_buf[2]
  443. mov rdi, r13 ; rdi = output_buf
  444. lea rsi, [rsi+rcx*SIZEOF_JSAMPROW] ; rsi = address of input_buf[0][in_row_group_ctr]; the callee indexes by in_row_group_ctr again, so it reads row 2*in_row_group_ctr
  445. push rdx ; inptr2
  446. push rbx ; inptr1
  447. push rsi ; inptr00
  448. mov rbx,rsp ; rbx = address of the 3-entry pointer array just pushed; passed as the callee's input_buf
  449. push rdi ; keep output_buf, in_row_group_ctr and output_width across the call
  450. push rcx
  451. push rax
  452. %ifdef WIN64
  453. mov r8, rcx ; arg 3 = in_row_group_ctr
  454. mov r9, rdi ; arg 4 = output_buf
  455. mov rcx, rax ; arg 1 = output_width
  456. mov rdx, rbx ; arg 2 = temporary pointer array
  457. %else
  458. mov rdx, rcx ; arg 3 = in_row_group_ctr
  459. mov rcx, rdi ; arg 4 = output_buf
  460. mov rdi, rax ; arg 1 = output_width
  461. mov rsi, rbx ; arg 2 = temporary pointer array
  462. %endif
  463. call EXTN(jsimd_h2v1_merged_upsample_sse2) ; NOTE(review): rsp is not re-aligned and no Win64 shadow space is reserved here; the callee aligns its own frame - confirm this is sufficient
  464. pop rax ; output_width
  465. pop rcx ; in_row_group_ctr
  466. pop rdi ; output_buf
  467. pop rsi ; reload the three array entries pushed before the call
  468. pop rbx
  469. pop rdx
  470. add rdi, byte SIZEOF_JSAMPROW ; outptr1: the callee will now read output_buf[1]
  471. add rsi, byte SIZEOF_JSAMPROW ; inptr01: the callee will now read row 2*in_row_group_ctr+1
  472. push rdx ; inptr2
  473. push rbx ; inptr1
  474. push rsi ; inptr01
  475. mov rbx,rsp ; rbx = address of the rebuilt 3-entry pointer array
  476. push rdi
  477. push rcx
  478. push rax
  479. %ifdef WIN64
  480. mov r8, rcx ; arg 3 = in_row_group_ctr
  481. mov r9, rdi ; arg 4 = output_buf advanced by one row pointer
  482. mov rcx, rax ; arg 1 = output_width
  483. mov rdx, rbx ; arg 2 = temporary pointer array
  484. %else
  485. mov rdx, rcx ; arg 3 = in_row_group_ctr
  486. mov rcx, rdi ; arg 4 = output_buf advanced by one row pointer
  487. mov rdi, rax ; arg 1 = output_width
  488. mov rsi, rbx ; arg 2 = temporary pointer array
  489. %endif
  490. call EXTN(jsimd_h2v1_merged_upsample_sse2) ; second output row; same alignment/shadow-space caveat as the first call
  491. pop rax ; discard the six values pushed for the second call
  492. pop rcx
  493. pop rdi
  494. pop rsi
  495. pop rbx
  496. pop rdx
  497. pop rbx ; matches the push rbx after collect_args
  498. uncollect_args ; macro defined outside this file
  499. pop rbp ; restore the caller's rbp
  500. ret
  501. ; For some reason, the OS X linker does not honor the request to align the
  502. ; segment unless we do this.
  503. align 16 ; pad the end of this section to a 16-byte boundary