
/src/opts/SkBlitRow_opts_SSE2.cpp

https://github.com/CyanogenMod/android_external_skia
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <emmintrin.h>
#include "SkBitmapProcState_opts_SSE2.h"
#include "SkBlitRow_opts_SSE2.h"
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
#include "SkDither.h"
#include "SkMSAN.h"
#include "SkUtils.h"

/* SSE2 version of S32_Blend_BlitRow32()
 * portable version is in core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count <= 0) {
        return;
    }

    uint32_t src_scale = SkAlpha255To256(alpha);
    uint32_t dst_scale = 256 - src_scale;
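    // SkAlpha255To256() maps 0..255 to 0..256, so the blend can divide by a
    // power of two: per pixel, each 8-bit lane becomes
    //   result = (src * src_scale + dst * dst_scale) >> 8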
    if (count >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
            src++;
            dst++;
            count--;
        }
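        // dst is now 16-byte aligned, so the loop below can use the aligned
        // _mm_store_si128; src may still be unaligned, hence _mm_loadu_si128.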
        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);

        while (count >= 4) {
            // Load 4 pixels each of src and dest.
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);

            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);

            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (count > 0) {
        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        src++;
        dst++;
        count--;
    }
}
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    sk_msan_assert_initialized(src, src+count);
    SkASSERT(alpha == 255);
    if (count <= 0) {
        return;
    }
#ifdef SK_USE_ACCURATE_BLENDING
    if (count >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)

        while (count >= 4) {
            // Load 4 pixels
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
            // Shift alphas down to the lower 8 bits of each quad.
            __m128i alpha = _mm_srli_epi32(src_pixel, 24);

            // Copy alpha to the third byte of each quad as well.
            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

            // Subtract the alphas from 255, giving the inverse alphas 0..255.
            alpha = _mm_sub_epi16(c_255, alpha);

            // Multiply dst red and blue by the inverse src alpha.
            dst_rb = _mm_mullo_epi16(dst_rb, alpha);

            // Multiply dst alpha and green by the inverse src alpha.
            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

            // dst_rb_low = (dst_rb >> 8)
            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
            dst_rb = _mm_add_epi16(dst_rb, c_128);
            dst_rb = _mm_srli_epi16(dst_rb, 8);

            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
            dst_ag = _mm_add_epi16(dst_ag, c_128);
            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

            // Combine back into RGBA.
            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
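            // Note: dst_ag is not shifted back down. The rounded result sits
            // in the high byte of each 16-bit lane, and masking with ~rb_mask
            // is equivalent to ((x >> 8) << 8), which is exactly where the A
            // and G bytes belong when OR'd with dst_rb.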
            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (count > 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
        count--;
    }
#else
    int count16 = count / 16;
    __m128i* dst4 = (__m128i*)dst;
    const __m128i* src4 = (const __m128i*)src;
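    // Process 16 pixels (four registers) per iteration, testing the two
    // common extremes first: if every source alpha is 0 the block can be
    // skipped outright, and if every source alpha is 0xFF the sources can
    // be stored without reading dst.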
    for (int i = 0; i < count16 * 4; i += 4) {
        // Load 16 source pixels.
        __m128i s0 = _mm_loadu_si128(src4+i+0),
                s1 = _mm_loadu_si128(src4+i+1),
                s2 = _mm_loadu_si128(src4+i+2),
                s3 = _mm_loadu_si128(src4+i+3);

        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);

        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully transparent. There's nothing to do!
            continue;
        }

        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
            _mm_storeu_si128(dst4+i+0, s0);
            _mm_storeu_si128(dst4+i+1, s1);
            _mm_storeu_si128(dst4+i+2, s2);
            _mm_storeu_si128(dst4+i+3, s3);
            continue;
        }

        // The general slow case: do the blend for all 16 pixels.
        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    }

    // Wrap up the last <= 15 pixels.
    SkASSERT(count - (count16*16) <= 15);
    for (int i = count16*16; i < count; i++) {
        // This check is not strictly necessary, but it prevents pointless autovectorization.
        if (src[i] & 0xFF000000) {
            dst[i] = SkPMSrcOver(src[i], dst[i]);
        }
    }
#endif
}
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count <= 0) {
        return;
    }

    if (count >= 4) {
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
        while (count >= 4) {
            // Load 4 pixels each of src and dest.
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (count > 0) {
        *dst = SkBlendARGB32(*src, *dst, alpha);
        src++;
        dst++;
        count--;
    }
}
void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    SkASSERT(count > 0);

    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
                          (SkGetPackedR32(src) << 13) |
                          (SkGetPackedB32(src) << 2);
    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
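    // src is pre-expanded into the g:11 r:10 x:1 b:10 layout consumed by
    // SkBlend32_RGB16(), and scale is reduced to 5 bits (0..32) so the
    // per-component products below fit in 16 bits.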
    // Check if we have enough pixels to run the SIMD path: 8 per iteration,
    // plus up to 7 leading pixels to reach 16-byte alignment (2 bytes each).
    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
        __m128i* dst_wide;
        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
        const __m128i scale_wide = _mm_set1_epi16(scale);
        const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);

        // Align dst to a 16-byte boundary (0-7 pixels).
        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
            dst += 1;
            count--;
        }
        dst_wide = reinterpret_cast<__m128i*>(dst);
        do {
            // Load eight RGB565 pixels
            __m128i pixels = _mm_load_si128(dst_wide);

            // Mask out sub-pixels
            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);

            // Scale with alpha
            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);

            // Add src_X_wide and shift down again
            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
            pixel_R = _mm_srli_epi16(pixel_R, 5);
            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
            pixel_B = _mm_srli_epi16(pixel_B, 5);

            // Combine into RGB565 and store
            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
            pixel_G = _mm_and_si128(pixel_G, mask_green);
            pixels = _mm_or_si128(pixel_R, pixel_G);
            pixels = _mm_or_si128(pixels, pixel_B);
            _mm_store_si128(dst_wide, pixels);

            count -= 8;
            dst_wide++;
        } while (count >= 8);

        dst = reinterpret_cast<uint16_t*>(dst_wide);
    }
    // Small loop to handle remaining pixels.
    while (count > 0) {
        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
        dst += 1;
        count--;
    }
}
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
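// For example, with the default RGB565 packing (SK_R16_SHIFT == 11,
// SK_R16_BITS == 5) and an SkPMColor layout where SK_R32_SHIFT == 16, the
// red shift is 16 - 11 - 5 + 5 == 5, moving the mask's red bits 11..15 up
// to bits 16..20 of each 32-bit lane.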
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
                                 __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example, is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16-bit mask pixel; we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16-bit mask pixels into 4 32-bit pixels, (p0, p1, p2, p3).
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position.
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);
    // Interleave R,G,B into the lower byte of each word,
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows replacing the division by 31 with a right-shift by 5 further down).
    // Right-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32.
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA.
    maskLo = _mm_mullo_epi16(maskLo, srcA);
    maskHi = _mm_mullo_epi16(maskHi, srcA);

    // Shift the mask components right by 8 (divide by 256).
    maskLo = _mm_srli_epi16(maskLo, 8);
    maskHi = _mm_srli_epi16(maskHi, 8);
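    // Each mask lane now holds (mask32 * srcA256) >> 8: the subpixel coverage
    // scaled by the source alpha, still in the range 0..32.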
    // Interleave R,G,B into the lower byte of each word.
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32-bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}
static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
                                       __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example, is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src stores 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved with
    // zeros.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16-bit mask pixel; we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16-bit mask pixels into 4 32-bit pixels, (p0, p1, p2, p3).
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position.
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);
    // Interleave R,G,B into the lower byte of each word,
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows replacing the division by 31 with a right-shift by 5 further down).
    // Right-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32.
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Interleave R,G,B into the lower byte of each word.
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32-bit dst pixels and force opaque.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary. Set alpha components to 0xFF.
    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
}
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
                         SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    srcA = SkAlpha255To256(srcA);

    if (width >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate the source four times in an SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        // Set srcA_sse to contain eight copies of srcA.
        __m128i srcA_sse = _mm_set1_epi16(srcA);
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into the lower half of mask_sse.
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Check whether the masks are equal to 0, keeping the highest bit
            // of each byte of the result; if the masks are all zero, pack_cmp
            // will be 0xFFFF.
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // If the mask pixels are not all zero, blend the dst pixels.
            if (pack_cmp != 0xFFFF) {
                // Unpack the 4 16-bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32-bit dst pixels.
                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
                                                   mask_sse, srcA_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (width > 0) {
        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
                               SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    if (width >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate the source four times in an SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into the lower half of mask_sse.
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Check whether the masks are equal to 0, keeping the highest bit
            // of each byte of the result; if the masks are all zero, pack_cmp
            // will be 0xFFFF.
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // If the mask pixels are not all zero, blend the dst pixels.
            if (pack_cmp != 0xFFFF) {
                // Unpack the 4 16-bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32-bit dst pixels.
                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
                                                         mask_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (width > 0) {
        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}
/* SSE2 version of S32_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 */
void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            *dst++ = SkPixel32ToPixel16_ToU16(c);
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            *dst++ = SkPixel32ToPixel16_ToU16(c);
        } while (--count != 0);
    }
}
/* SSE2 version of S32A_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 */
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src,
                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Align dst to a 16-byte boundary.
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var255 = _mm_set1_epi16(255);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Check whether the src pixels are equal to 0 and get the highest
            // bit of each byte of the result; if the src pixels are all zero,
            // src_cmp1 and src_cmp2 will be 0xFFFF.
            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
                                             _mm_setzero_si128()));
            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
                                             _mm_setzero_si128()));
            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
                d++;
                count -= 8;
                continue;
            }

            // Load 8 pixels of dst.
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);
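            // Each channel is isolated by shifting it to the top byte of its
            // 32-bit lane and back down, then the two 4-pixel registers are
            // packed into eight 16-bit lanes.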
            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);
            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa

            // Calculate R G B of the result.
            // The original algorithm is in SkSrcOver32To16().
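            // Per channel this computes (R shown; G and B are analogous):
            //   dr = (sr + SkMul16ShiftRound(dr, isa, SK_R16_BITS)) >> (8 - SK_R16_BITS)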
            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

            // Pack R G B into 16-bit color.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

            // Store 8 16-bit colors in dst.
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);
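        // gDitherMatrix_3Bit_16 packs one row of the 4x4 dither matrix into a
        // uint16_t, four bits per x position. The four values are replicated
        // across all eight 16-bit lanes, so the same register covers a whole
        // 8-pixel iteration (the pattern repeats with period 4 in x).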
        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // sr = SkDITHER_R32To565(sr, dither)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
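            // (Scalar equivalent: sr = (sr + d - (sr >> 5)) >> 3. Subtracting
            // sr >> 5 compensates for the added dither, so 0xFF still maps to
            // the maximum 5-bit value for the 3-bit dither range used here.)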
            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32To565(sg, dither); green keeps 6 bits, so it
            // uses half the dither value.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32To565(sb, dither)
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store the 16-bit dst pixels.
            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
/* SSE2 version of S32A_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 */
void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
            count--;
        }
        unsigned short dither_value[8];
        __m128i dither, dither_cur;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);
        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var256 = _mm_set1_epi16(256);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src and dst.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Calculate the current dither value, scaled by the src alpha.
            dither_cur = _mm_mullo_epi16(dither,
                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
            dither_cur = _mm_srli_epi16(dither_cur, 8);
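            // This matches the scalar path above:
            //   d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a))
            //     = (dither * (a + 1)) >> 8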
            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // sr = SkDITHER_R32_FOR_565(sr, d)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither_cur);
            sr = _mm_sub_epi16(sr, sr_offset);

            // Expand sr.
            sr = _mm_slli_epi16(sr, 2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32_FOR_565(sg, d)
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
            sg = _mm_sub_epi16(sg, sg_offset);

            // Expand sg.
            sg = _mm_slli_epi16(sg, 3);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32_FOR_565(sb, d)
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither_cur);
            sb = _mm_sub_epi16(sb, sb_offset);

            // Expand sb.
            sb = _mm_slli_epi16(sb, 2);
            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            // isa = SkAlpha255To256(255 - a) >> 3
            __m128i isa = _mm_sub_epi16(var256, sa);
            isa = _mm_srli_epi16(isa, 3);
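            // dr * isa (scale 0..32) lands on the same fixed-point scale as
            // the expanded src components, so adding them and shifting right
            // by 5 yields the final 5/6/5 values.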
            dr = _mm_mullo_epi16(dr, isa);
            dr = _mm_add_epi16(dr, sr);
            dr = _mm_srli_epi16(dr, 5);

            dg = _mm_mullo_epi16(dg, isa);
            dg = _mm_add_epi16(dg, sg);
            dg = _mm_srli_epi16(dg, 5);

            db = _mm_mullo_epi16(db, isa);
            db = _mm_add_epi16(db, sb);
            db = _mm_srli_epi16(db, 5);

            // Pack and store the dst pixels.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}