PageRenderTime 48ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/src/opts/SkBlitRow_opts_SSE2.cpp

http://skia.googlecode.com/
C++ | 736 lines | 466 code | 129 blank | 141 comment | 61 complexity | fbab2bd8298d218908753895fb5d3be8 MD5 | raw file
Possible License(s): BSD-3-Clause, CC-BY-SA-3.0
  1. /*
  2. * Copyright 2012 The Android Open Source Project
  3. *
  4. * Use of this source code is governed by a BSD-style license that can be
  5. * found in the LICENSE file.
  6. */
  7. #include "SkBlitRow_opts_SSE2.h"
  8. #include "SkColorPriv.h"
  9. #include "SkUtils.h"
  10. #include <emmintrin.h>
  11. /* SSE2 version of S32_Blend_BlitRow32()
  12. * portable version is in core/SkBlitRow_D32.cpp
  13. */
  14. void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  15. const SkPMColor* SK_RESTRICT src,
  16. int count, U8CPU alpha) {
  17. SkASSERT(alpha <= 255);
  18. if (count <= 0) {
  19. return;
  20. }
  21. uint32_t src_scale = SkAlpha255To256(alpha);
  22. uint32_t dst_scale = 256 - src_scale;
  23. if (count >= 4) {
  24. SkASSERT(((size_t)dst & 0x03) == 0);
  25. while (((size_t)dst & 0x0F) != 0) {
  26. *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  27. src++;
  28. dst++;
  29. count--;
  30. }
  31. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  32. __m128i *d = reinterpret_cast<__m128i*>(dst);
  33. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  34. __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
  35. // Move scale factors to upper byte of word
  36. __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
  37. __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
  38. while (count >= 4) {
  39. // Load 4 pixels each of src and dest.
  40. __m128i src_pixel = _mm_loadu_si128(s);
  41. __m128i dst_pixel = _mm_load_si128(d);
  42. // Interleave Atom port 0/1 operations based on the execution port
  43. // constraints that multiply can only be executed on port 0 (while
  44. // boolean operations can be executed on either port 0 or port 1)
  45. // because GCC currently doesn't do a good job scheduling
  46. // instructions based on these constraints.
  47. // Get red and blue pixels into lower byte of each word.
  48. // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
  49. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  50. // Multiply by scale.
  51. // (4 x (0, rs.h, 0, bs.h))
  52. // where rs.h stands for the higher byte of r * scale, and
  53. // bs.h the higher byte of b * scale.
  54. src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
  55. // Get alpha and green pixels into higher byte of each word.
  56. // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
  57. __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
  58. // Multiply by scale.
  59. // (4 x (as.h, as.l, gs.h, gs.l))
  60. src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
  61. // Clear the lower byte of the a*scale and g*scale results
  62. // (4 x (as.h, 0, gs.h, 0))
  63. src_ag = _mm_and_si128(src_ag, ag_mask);
  64. // Operations the destination pixels are the same as on the
  65. // source pixels. See the comments above.
  66. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  67. dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
  68. __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
  69. dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
  70. dst_ag = _mm_and_si128(dst_ag, ag_mask);
  71. // Combine back into RGBA.
  72. // (4 x (as.h, rs.h, gs.h, bs.h))
  73. src_pixel = _mm_or_si128(src_rb, src_ag);
  74. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  75. // Add result
  76. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  77. _mm_store_si128(d, result);
  78. s++;
  79. d++;
  80. count -= 4;
  81. }
  82. src = reinterpret_cast<const SkPMColor*>(s);
  83. dst = reinterpret_cast<SkPMColor*>(d);
  84. }
  85. while (count > 0) {
  86. *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  87. src++;
  88. dst++;
  89. count--;
  90. }
  91. }
  92. void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  93. const SkPMColor* SK_RESTRICT src,
  94. int count, U8CPU alpha) {
  95. SkASSERT(alpha == 255);
  96. if (count <= 0) {
  97. return;
  98. }
  99. if (count >= 4) {
  100. SkASSERT(((size_t)dst & 0x03) == 0);
  101. while (((size_t)dst & 0x0F) != 0) {
  102. *dst = SkPMSrcOver(*src, *dst);
  103. src++;
  104. dst++;
  105. count--;
  106. }
  107. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  108. __m128i *d = reinterpret_cast<__m128i*>(dst);
  109. #ifdef SK_USE_ACCURATE_BLENDING
  110. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  111. __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
  112. __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
  113. while (count >= 4) {
  114. // Load 4 pixels
  115. __m128i src_pixel = _mm_loadu_si128(s);
  116. __m128i dst_pixel = _mm_load_si128(d);
  117. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  118. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  119. // Shift alphas down to lower 8 bits of each quad.
  120. __m128i alpha = _mm_srli_epi32(src_pixel, 24);
  121. // Copy alpha to upper 3rd byte of each quad
  122. alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
  123. // Subtract alphas from 255, to get 0..255
  124. alpha = _mm_sub_epi16(c_255, alpha);
  125. // Multiply by red and blue by src alpha.
  126. dst_rb = _mm_mullo_epi16(dst_rb, alpha);
  127. // Multiply by alpha and green by src alpha.
  128. dst_ag = _mm_mullo_epi16(dst_ag, alpha);
  129. // dst_rb_low = (dst_rb >> 8)
  130. __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
  131. __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
  132. // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
  133. dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
  134. dst_rb = _mm_add_epi16(dst_rb, c_128);
  135. dst_rb = _mm_srli_epi16(dst_rb, 8);
  136. // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
  137. dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
  138. dst_ag = _mm_add_epi16(dst_ag, c_128);
  139. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  140. // Combine back into RGBA.
  141. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  142. // Add result
  143. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  144. _mm_store_si128(d, result);
  145. s++;
  146. d++;
  147. count -= 4;
  148. }
  149. #else
  150. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  151. __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
  152. while (count >= 4) {
  153. // Load 4 pixels
  154. __m128i src_pixel = _mm_loadu_si128(s);
  155. __m128i dst_pixel = _mm_load_si128(d);
  156. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  157. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  158. // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
  159. __m128i alpha = _mm_srli_epi16(src_pixel, 8);
  160. // (a0, a0, a1, a1, a2, g2, a3, g3)
  161. alpha = _mm_shufflehi_epi16(alpha, 0xF5);
  162. // (a0, a0, a1, a1, a2, a2, a3, a3)
  163. alpha = _mm_shufflelo_epi16(alpha, 0xF5);
  164. // Subtract alphas from 256, to get 1..256
  165. alpha = _mm_sub_epi16(c_256, alpha);
  166. // Multiply by red and blue by src alpha.
  167. dst_rb = _mm_mullo_epi16(dst_rb, alpha);
  168. // Multiply by alpha and green by src alpha.
  169. dst_ag = _mm_mullo_epi16(dst_ag, alpha);
  170. // Divide by 256.
  171. dst_rb = _mm_srli_epi16(dst_rb, 8);
  172. // Mask out high bits (already in the right place)
  173. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  174. // Combine back into RGBA.
  175. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  176. // Add result
  177. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  178. _mm_store_si128(d, result);
  179. s++;
  180. d++;
  181. count -= 4;
  182. }
  183. #endif
  184. src = reinterpret_cast<const SkPMColor*>(s);
  185. dst = reinterpret_cast<SkPMColor*>(d);
  186. }
  187. while (count > 0) {
  188. *dst = SkPMSrcOver(*src, *dst);
  189. src++;
  190. dst++;
  191. count--;
  192. }
  193. }
  194. void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  195. const SkPMColor* SK_RESTRICT src,
  196. int count, U8CPU alpha) {
  197. SkASSERT(alpha <= 255);
  198. if (count <= 0) {
  199. return;
  200. }
  201. if (count >= 4) {
  202. while (((size_t)dst & 0x0F) != 0) {
  203. *dst = SkBlendARGB32(*src, *dst, alpha);
  204. src++;
  205. dst++;
  206. count--;
  207. }
  208. uint32_t src_scale = SkAlpha255To256(alpha);
  209. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  210. __m128i *d = reinterpret_cast<__m128i*>(dst);
  211. __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
  212. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  213. __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
  214. while (count >= 4) {
  215. // Load 4 pixels each of src and dest.
  216. __m128i src_pixel = _mm_loadu_si128(s);
  217. __m128i dst_pixel = _mm_load_si128(d);
  218. // Get red and blue pixels into lower byte of each word.
  219. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  220. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  221. // Get alpha and green into lower byte of each word.
  222. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  223. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  224. // Put per-pixel alpha in low byte of each word.
  225. // After the following two statements, the dst_alpha looks like
  226. // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
  227. __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
  228. dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
  229. // dst_alpha = dst_alpha * src_scale
  230. // Because src_scales are in the higher byte of each word and
  231. // we use mulhi here, the resulting alpha values are already
  232. // in the right place and don't need to be divided by 256.
  233. // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
  234. dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
  235. // Subtract alphas from 256, to get 1..256
  236. dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
  237. // Multiply red and blue by dst pixel alpha.
  238. dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
  239. // Multiply alpha and green by dst pixel alpha.
  240. dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
  241. // Multiply red and blue by global alpha.
  242. // (4 x (0, rs.h, 0, bs.h))
  243. // where rs.h stands for the higher byte of r * src_scale,
  244. // and bs.h the higher byte of b * src_scale.
  245. // Again, because we use mulhi, the resuling red and blue
  246. // values are already in the right place and don't need to
  247. // be divided by 256.
  248. src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
  249. // Multiply alpha and green by global alpha.
  250. // (4 x (0, as.h, 0, gs.h))
  251. src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
  252. // Divide by 256.
  253. dst_rb = _mm_srli_epi16(dst_rb, 8);
  254. // Mask out low bits (goodies already in the right place; no need to divide)
  255. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  256. // Shift alpha and green to higher byte of each word.
  257. // (4 x (as.h, 0, gs.h, 0))
  258. src_ag = _mm_slli_epi16(src_ag, 8);
  259. // Combine back into RGBA.
  260. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  261. src_pixel = _mm_or_si128(src_rb, src_ag);
  262. // Add two pixels into result.
  263. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  264. _mm_store_si128(d, result);
  265. s++;
  266. d++;
  267. count -= 4;
  268. }
  269. src = reinterpret_cast<const SkPMColor*>(s);
  270. dst = reinterpret_cast<SkPMColor*>(d);
  271. }
  272. while (count > 0) {
  273. *dst = SkBlendARGB32(*src, *dst, alpha);
  274. src++;
  275. dst++;
  276. count--;
  277. }
  278. }
  279. /* SSE2 version of Color32()
  280. * portable version is in core/SkBlitRow_D32.cpp
  281. */
  282. void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
  283. SkPMColor color) {
  284. if (count <= 0) {
  285. return;
  286. }
  287. if (0 == color) {
  288. if (src != dst) {
  289. memcpy(dst, src, count * sizeof(SkPMColor));
  290. }
  291. return;
  292. }
  293. unsigned colorA = SkGetPackedA32(color);
  294. if (255 == colorA) {
  295. sk_memset32(dst, color, count);
  296. } else {
  297. unsigned scale = 256 - SkAlpha255To256(colorA);
  298. if (count >= 4) {
  299. SkASSERT(((size_t)dst & 0x03) == 0);
  300. while (((size_t)dst & 0x0F) != 0) {
  301. *dst = color + SkAlphaMulQ(*src, scale);
  302. src++;
  303. dst++;
  304. count--;
  305. }
  306. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  307. __m128i *d = reinterpret_cast<__m128i*>(dst);
  308. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  309. __m128i src_scale_wide = _mm_set1_epi16(scale);
  310. __m128i color_wide = _mm_set1_epi32(color);
  311. while (count >= 4) {
  312. // Load 4 pixels each of src and dest.
  313. __m128i src_pixel = _mm_loadu_si128(s);
  314. // Get red and blue pixels into lower byte of each word.
  315. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  316. // Get alpha and green into lower byte of each word.
  317. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  318. // Multiply by scale.
  319. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  320. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  321. // Divide by 256.
  322. src_rb = _mm_srli_epi16(src_rb, 8);
  323. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  324. // Combine back into RGBA.
  325. src_pixel = _mm_or_si128(src_rb, src_ag);
  326. // Add color to result.
  327. __m128i result = _mm_add_epi8(color_wide, src_pixel);
  328. // Store result.
  329. _mm_store_si128(d, result);
  330. s++;
  331. d++;
  332. count -= 4;
  333. }
  334. src = reinterpret_cast<const SkPMColor*>(s);
  335. dst = reinterpret_cast<SkPMColor*>(d);
  336. }
  337. while (count > 0) {
  338. *dst = color + SkAlphaMulQ(*src, scale);
  339. src += 1;
  340. dst += 1;
  341. count--;
  342. }
  343. }
  344. }
  345. void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
  346. size_t maskRB, SkColor origColor,
  347. int width, int height) {
  348. SkPMColor color = SkPreMultiplyColor(origColor);
  349. size_t dstOffset = dstRB - (width << 2);
  350. size_t maskOffset = maskRB - width;
  351. SkPMColor* dst = (SkPMColor *)device;
  352. const uint8_t* mask = (const uint8_t*)maskPtr;
  353. do {
  354. int count = width;
  355. if (count >= 4) {
  356. while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
  357. *dst = SkBlendARGB32(color, *dst, *mask);
  358. mask++;
  359. dst++;
  360. count--;
  361. }
  362. __m128i *d = reinterpret_cast<__m128i*>(dst);
  363. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  364. __m128i c_256 = _mm_set1_epi16(256);
  365. __m128i c_1 = _mm_set1_epi16(1);
  366. __m128i src_pixel = _mm_set1_epi32(color);
  367. while (count >= 4) {
  368. // Load 4 pixels each of src and dest.
  369. __m128i dst_pixel = _mm_load_si128(d);
  370. //set the aphla value
  371. __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
  372. 0, *(mask+3),0, \
  373. *(mask+2),0, *(mask+2),\
  374. 0,*(mask+1), 0,*(mask+1),\
  375. 0, *mask,0,*mask);
  376. //call SkAlpha255To256()
  377. src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
  378. // Get red and blue pixels into lower byte of each word.
  379. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  380. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  381. // Get alpha and green into lower byte of each word.
  382. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  383. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  384. // Put per-pixel alpha in low byte of each word.
  385. __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
  386. dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
  387. // dst_alpha = dst_alpha * src_scale
  388. dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
  389. // Divide by 256.
  390. dst_alpha = _mm_srli_epi16(dst_alpha, 8);
  391. // Subtract alphas from 256, to get 1..256
  392. dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
  393. // Multiply red and blue by dst pixel alpha.
  394. dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
  395. // Multiply alpha and green by dst pixel alpha.
  396. dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
  397. // Multiply red and blue by global alpha.
  398. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  399. // Multiply alpha and green by global alpha.
  400. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  401. // Divide by 256.
  402. dst_rb = _mm_srli_epi16(dst_rb, 8);
  403. src_rb = _mm_srli_epi16(src_rb, 8);
  404. // Mask out low bits (goodies already in the right place; no need to divide)
  405. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  406. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  407. // Combine back into RGBA.
  408. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  409. __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
  410. // Add two pixels into result.
  411. __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
  412. _mm_store_si128(d, result);
  413. // load the next 4 pixel
  414. mask = mask + 4;
  415. d++;
  416. count -= 4;
  417. }
  418. dst = reinterpret_cast<SkPMColor *>(d);
  419. }
  420. while(count > 0) {
  421. *dst= SkBlendARGB32(color, *dst, *mask);
  422. dst += 1;
  423. mask++;
  424. count --;
  425. }
  426. dst = (SkPMColor *)((char*)dst + dstOffset);
  427. mask += maskOffset;
  428. } while (--height != 0);
  429. }
  430. static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
  431. __m128i &mask, __m128i &scale) {
  432. // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
  433. __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
  434. 16-SK_R16_SHIFT-(SK_R16_BITS-5)),
  435. _mm_set1_epi32(0x001F0000));
  436. __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
  437. 8-SK_G16_SHIFT-(SK_G16_BITS-5)),
  438. _mm_set1_epi32(0x00001F00));
  439. __m128i b = _mm_and_si128(_mm_slli_epi32(mask,
  440. SK_B16_BITS-5),
  441. _mm_set1_epi32(0x0000001F));
  442. // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
  443. mask = _mm_or_si128(_mm_or_si128(r, g), b);
  444. // Interleave R,G,B into the lower byte of word.
  445. __m128i maskLo, maskHi;
  446. maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
  447. maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
  448. // Upscale to 0..32
  449. maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
  450. maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
  451. maskLo = _mm_mullo_epi16(maskLo, scale);
  452. maskHi = _mm_mullo_epi16(maskHi, scale);
  453. maskLo = _mm_srli_epi16(maskLo, 8);
  454. maskHi = _mm_srli_epi16(maskHi, 8);
  455. // Interleave R,G,B into the lower byte of the word.
  456. __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
  457. __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
  458. maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
  459. maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
  460. maskLo = _mm_srai_epi16(maskLo, 5);
  461. maskHi = _mm_srai_epi16(maskHi, 5);
  462. // Add two pixels into result.
  463. __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
  464. __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
  465. // Pack into 4 32bit dst pixels
  466. return _mm_packus_epi16(resultLo, resultHi);
  467. }
  468. static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
  469. __m128i &mask) {
  470. // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
  471. __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
  472. 16-SK_R16_SHIFT-(SK_R16_BITS-5)),
  473. _mm_set1_epi32(0x001F0000));
  474. __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
  475. 8-SK_G16_SHIFT-(SK_G16_BITS-5)),
  476. _mm_set1_epi32(0x00001F00));
  477. __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5),
  478. _mm_set1_epi32(0x0000001F));
  479. // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
  480. mask = _mm_or_si128(_mm_or_si128(r, g), b);
  481. // Interleave R,G,B into the lower byte of word.
  482. __m128i maskLo, maskHi;
  483. maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
  484. maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
  485. // Upscale to 0..32
  486. maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
  487. maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
  488. // Interleave R,G,B into the lower byte of the word.
  489. __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
  490. __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
  491. maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
  492. maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
  493. maskLo = _mm_srai_epi16(maskLo, 5);
  494. maskHi = _mm_srai_epi16(maskHi, 5);
  495. // Add two pixels into result.
  496. __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
  497. __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
  498. // Pack into 4 32bit dst pixels
  499. return _mm_packus_epi16(resultLo, resultHi);
  500. }
  501. void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
  502. SkColor color, int width, SkPMColor) {
  503. if (width <= 0) {
  504. return;
  505. }
  506. int srcA = SkColorGetA(color);
  507. int srcR = SkColorGetR(color);
  508. int srcG = SkColorGetG(color);
  509. int srcB = SkColorGetB(color);
  510. srcA = SkAlpha255To256(srcA);
  511. if (width >= 4) {
  512. SkASSERT(((size_t)dst & 0x03) == 0);
  513. while (((size_t)dst & 0x0F) != 0) {
  514. *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
  515. src++;
  516. dst++;
  517. width--;
  518. }
  519. __m128i *d = reinterpret_cast<__m128i*>(dst);
  520. __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
  521. srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
  522. __m128i scale = _mm_set1_epi16(srcA);
  523. while (width >= 4) {
  524. __m128i dst_pixel = _mm_load_si128(d);
  525. __m128i mask_pixel = _mm_loadl_epi64(
  526. reinterpret_cast<const __m128i*>(src));
  527. // Check whether mask_pixels are equal to 0 and get the highest bit
  528. // of each byte of result, if mask pixes are all zero, we will get
  529. // pack_cmp to 0xFFFF
  530. int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
  531. _mm_setzero_si128()));
  532. // if mask pixels are not all zero, we will blend the dst pixels
  533. if (pack_cmp != 0xFFFF) {
  534. // Unpack 4 16bit mask pixels to
  535. // (p0, 0, p1, 0, p2, 0, p3, 0)
  536. mask_pixel = _mm_unpacklo_epi16(mask_pixel,
  537. _mm_setzero_si128());
  538. // Process 4 32bit dst pixels
  539. __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
  540. mask_pixel, scale);
  541. _mm_store_si128(d, result);
  542. }
  543. d++;
  544. src += 4;
  545. width -= 4;
  546. }
  547. dst = reinterpret_cast<SkPMColor*>(d);
  548. }
  549. while (width > 0) {
  550. *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
  551. src++;
  552. dst++;
  553. width--;
  554. }
  555. }
  556. void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
  557. SkColor color, int width, SkPMColor opaqueDst) {
  558. if (width <= 0) {
  559. return;
  560. }
  561. int srcR = SkColorGetR(color);
  562. int srcG = SkColorGetG(color);
  563. int srcB = SkColorGetB(color);
  564. if (width >= 4) {
  565. SkASSERT(((size_t)dst & 0x03) == 0);
  566. while (((size_t)dst & 0x0F) != 0) {
  567. *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
  568. src++;
  569. dst++;
  570. width--;
  571. }
  572. __m128i *d = reinterpret_cast<__m128i*>(dst);
  573. __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
  574. srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
  575. while (width >= 4) {
  576. __m128i dst_pixel = _mm_load_si128(d);
  577. __m128i mask_pixel = _mm_loadl_epi64(
  578. reinterpret_cast<const __m128i*>(src));
  579. // Check whether mask_pixels are equal to 0 and get the highest bit
  580. // of each byte of result, if mask pixes are all zero, we will get
  581. // pack_cmp to 0xFFFF
  582. int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
  583. _mm_setzero_si128()));
  584. // if mask pixels are not all zero, we will blend the dst pixels
  585. if (pack_cmp != 0xFFFF) {
  586. // Unpack 4 16bit mask pixels to
  587. // (p0, 0, p1, 0, p2, 0, p3, 0)
  588. mask_pixel = _mm_unpacklo_epi16(mask_pixel,
  589. _mm_setzero_si128());
  590. // Process 4 32bit dst pixels
  591. __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
  592. mask_pixel);
  593. _mm_store_si128(d, result);
  594. }
  595. d++;
  596. src += 4;
  597. width -= 4;
  598. }
  599. dst = reinterpret_cast<SkPMColor*>(d);
  600. }
  601. while (width > 0) {
  602. *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
  603. src++;
  604. dst++;
  605. width--;
  606. }
  607. }