PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/third_party/skia/src/opts/SkBlitRow_opts_SSE2.cpp

https://github.com/lianliuwei/chromium_base
C++ | 483 lines | 319 code | 82 blank | 82 comment | 43 complexity | 840d7c464e58c369b2878b45340bad51 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0-no-copyleft-exception, Apache-2.0, LGPL-2.1, LGPL-3.0
  1. /*
  2. * Copyright 2009 The Android Open Source Project
  3. *
  4. * Use of this source code is governed by a BSD-style license that can be
  5. * found in the LICENSE file.
  6. */
  7. #include "SkBlitRow_opts_SSE2.h"
  8. #include "SkColorPriv.h"
  9. #include "SkUtils.h"
  10. #include <emmintrin.h>
  11. /* SSE2 version of S32_Blend_BlitRow32()
  12. * portable version is in core/SkBlitRow_D32.cpp
  13. */
  14. void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  15. const SkPMColor* SK_RESTRICT src,
  16. int count, U8CPU alpha) {
  17. SkASSERT(alpha <= 255);
  18. if (count <= 0) {
  19. return;
  20. }
  21. uint32_t src_scale = SkAlpha255To256(alpha);
  22. uint32_t dst_scale = 256 - src_scale;
  23. if (count >= 4) {
  24. SkASSERT(((size_t)dst & 0x03) == 0);
  25. while (((size_t)dst & 0x0F) != 0) {
  26. *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  27. src++;
  28. dst++;
  29. count--;
  30. }
  31. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  32. __m128i *d = reinterpret_cast<__m128i*>(dst);
  33. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  34. __m128i src_scale_wide = _mm_set1_epi16(src_scale);
  35. __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
  36. while (count >= 4) {
  37. // Load 4 pixels each of src and dest.
  38. __m128i src_pixel = _mm_loadu_si128(s);
  39. __m128i dst_pixel = _mm_load_si128(d);
  40. // Get red and blue pixels into lower byte of each word.
  41. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  42. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  43. // Get alpha and green into lower byte of each word.
  44. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  45. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  46. // Multiply by scale.
  47. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  48. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  49. dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
  50. dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
  51. // Divide by 256.
  52. src_rb = _mm_srli_epi16(src_rb, 8);
  53. dst_rb = _mm_srli_epi16(dst_rb, 8);
  54. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  55. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  56. // Combine back into RGBA.
  57. src_pixel = _mm_or_si128(src_rb, src_ag);
  58. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  59. // Add result
  60. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  61. _mm_store_si128(d, result);
  62. s++;
  63. d++;
  64. count -= 4;
  65. }
  66. src = reinterpret_cast<const SkPMColor*>(s);
  67. dst = reinterpret_cast<SkPMColor*>(d);
  68. }
  69. while (count > 0) {
  70. *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  71. src++;
  72. dst++;
  73. count--;
  74. }
  75. }
  76. void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  77. const SkPMColor* SK_RESTRICT src,
  78. int count, U8CPU alpha) {
  79. SkASSERT(alpha == 255);
  80. if (count <= 0) {
  81. return;
  82. }
  83. if (count >= 4) {
  84. SkASSERT(((size_t)dst & 0x03) == 0);
  85. while (((size_t)dst & 0x0F) != 0) {
  86. *dst = SkPMSrcOver(*src, *dst);
  87. src++;
  88. dst++;
  89. count--;
  90. }
  91. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  92. __m128i *d = reinterpret_cast<__m128i*>(dst);
  93. #ifdef SK_USE_ACCURATE_BLENDING
  94. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  95. __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
  96. __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
  97. while (count >= 4) {
  98. // Load 4 pixels
  99. __m128i src_pixel = _mm_loadu_si128(s);
  100. __m128i dst_pixel = _mm_load_si128(d);
  101. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  102. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  103. // Shift alphas down to lower 8 bits of each quad.
  104. __m128i alpha = _mm_srli_epi32(src_pixel, 24);
  105. // Copy alpha to upper 3rd byte of each quad
  106. alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
  107. // Subtract alphas from 255, to get 0..255
  108. alpha = _mm_sub_epi16(c_255, alpha);
  109. // Multiply by red and blue by src alpha.
  110. dst_rb = _mm_mullo_epi16(dst_rb, alpha);
  111. // Multiply by alpha and green by src alpha.
  112. dst_ag = _mm_mullo_epi16(dst_ag, alpha);
  113. // dst_rb_low = (dst_rb >> 8)
  114. __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
  115. __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
  116. // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
  117. dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
  118. dst_rb = _mm_add_epi16(dst_rb, c_128);
  119. dst_rb = _mm_srli_epi16(dst_rb, 8);
  120. // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
  121. dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
  122. dst_ag = _mm_add_epi16(dst_ag, c_128);
  123. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  124. // Combine back into RGBA.
  125. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  126. // Add result
  127. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  128. _mm_store_si128(d, result);
  129. s++;
  130. d++;
  131. count -= 4;
  132. }
  133. #else
  134. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  135. __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
  136. while (count >= 4) {
  137. // Load 4 pixels
  138. __m128i src_pixel = _mm_loadu_si128(s);
  139. __m128i dst_pixel = _mm_load_si128(d);
  140. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  141. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  142. // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
  143. __m128i alpha = _mm_srli_epi16(src_pixel, 8);
  144. // (a0, a0, a1, a1, a2, g2, a3, g3)
  145. alpha = _mm_shufflehi_epi16(alpha, 0xF5);
  146. // (a0, a0, a1, a1, a2, a2, a3, a3)
  147. alpha = _mm_shufflelo_epi16(alpha, 0xF5);
  148. // Subtract alphas from 256, to get 1..256
  149. alpha = _mm_sub_epi16(c_256, alpha);
  150. // Multiply by red and blue by src alpha.
  151. dst_rb = _mm_mullo_epi16(dst_rb, alpha);
  152. // Multiply by alpha and green by src alpha.
  153. dst_ag = _mm_mullo_epi16(dst_ag, alpha);
  154. // Divide by 256.
  155. dst_rb = _mm_srli_epi16(dst_rb, 8);
  156. // Mask out high bits (already in the right place)
  157. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  158. // Combine back into RGBA.
  159. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  160. // Add result
  161. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  162. _mm_store_si128(d, result);
  163. s++;
  164. d++;
  165. count -= 4;
  166. }
  167. #endif
  168. src = reinterpret_cast<const SkPMColor*>(s);
  169. dst = reinterpret_cast<SkPMColor*>(d);
  170. }
  171. while (count > 0) {
  172. *dst = SkPMSrcOver(*src, *dst);
  173. src++;
  174. dst++;
  175. count--;
  176. }
  177. }
  178. void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  179. const SkPMColor* SK_RESTRICT src,
  180. int count, U8CPU alpha) {
  181. SkASSERT(alpha <= 255);
  182. if (count <= 0) {
  183. return;
  184. }
  185. if (count >= 4) {
  186. while (((size_t)dst & 0x0F) != 0) {
  187. *dst = SkBlendARGB32(*src, *dst, alpha);
  188. src++;
  189. dst++;
  190. count--;
  191. }
  192. uint32_t src_scale = SkAlpha255To256(alpha);
  193. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  194. __m128i *d = reinterpret_cast<__m128i*>(dst);
  195. __m128i src_scale_wide = _mm_set1_epi16(src_scale);
  196. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  197. __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
  198. while (count >= 4) {
  199. // Load 4 pixels each of src and dest.
  200. __m128i src_pixel = _mm_loadu_si128(s);
  201. __m128i dst_pixel = _mm_load_si128(d);
  202. // Get red and blue pixels into lower byte of each word.
  203. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  204. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  205. // Get alpha and green into lower byte of each word.
  206. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  207. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  208. // Put per-pixel alpha in low byte of each word.
  209. __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
  210. dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
  211. // dst_alpha = dst_alpha * src_scale
  212. dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
  213. // Divide by 256.
  214. dst_alpha = _mm_srli_epi16(dst_alpha, 8);
  215. // Subtract alphas from 256, to get 1..256
  216. dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
  217. // Multiply red and blue by dst pixel alpha.
  218. dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
  219. // Multiply alpha and green by dst pixel alpha.
  220. dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
  221. // Multiply red and blue by global alpha.
  222. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  223. // Multiply alpha and green by global alpha.
  224. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  225. // Divide by 256.
  226. dst_rb = _mm_srli_epi16(dst_rb, 8);
  227. src_rb = _mm_srli_epi16(src_rb, 8);
  228. // Mask out low bits (goodies already in the right place; no need to divide)
  229. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  230. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  231. // Combine back into RGBA.
  232. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  233. src_pixel = _mm_or_si128(src_rb, src_ag);
  234. // Add two pixels into result.
  235. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  236. _mm_store_si128(d, result);
  237. s++;
  238. d++;
  239. count -= 4;
  240. }
  241. src = reinterpret_cast<const SkPMColor*>(s);
  242. dst = reinterpret_cast<SkPMColor*>(d);
  243. }
  244. while (count > 0) {
  245. *dst = SkBlendARGB32(*src, *dst, alpha);
  246. src++;
  247. dst++;
  248. count--;
  249. }
  250. }
  251. /* SSE2 version of Color32()
  252. * portable version is in core/SkBlitRow_D32.cpp
  253. */
  254. void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
  255. SkPMColor color) {
  256. if (count <= 0) {
  257. return;
  258. }
  259. if (0 == color) {
  260. if (src != dst) {
  261. memcpy(dst, src, count * sizeof(SkPMColor));
  262. }
  263. }
  264. unsigned colorA = SkGetPackedA32(color);
  265. if (255 == colorA) {
  266. sk_memset32(dst, color, count);
  267. } else {
  268. unsigned scale = 256 - SkAlpha255To256(colorA);
  269. if (count >= 4) {
  270. SkASSERT(((size_t)dst & 0x03) == 0);
  271. while (((size_t)dst & 0x0F) != 0) {
  272. *dst = color + SkAlphaMulQ(*src, scale);
  273. src++;
  274. dst++;
  275. count--;
  276. }
  277. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  278. __m128i *d = reinterpret_cast<__m128i*>(dst);
  279. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  280. __m128i src_scale_wide = _mm_set1_epi16(scale);
  281. __m128i color_wide = _mm_set1_epi32(color);
  282. while (count >= 4) {
  283. // Load 4 pixels each of src and dest.
  284. __m128i src_pixel = _mm_loadu_si128(s);
  285. // Get red and blue pixels into lower byte of each word.
  286. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  287. // Get alpha and green into lower byte of each word.
  288. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  289. // Multiply by scale.
  290. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  291. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  292. // Divide by 256.
  293. src_rb = _mm_srli_epi16(src_rb, 8);
  294. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  295. // Combine back into RGBA.
  296. src_pixel = _mm_or_si128(src_rb, src_ag);
  297. // Add color to result.
  298. __m128i result = _mm_add_epi8(color_wide, src_pixel);
  299. // Store result.
  300. _mm_store_si128(d, result);
  301. s++;
  302. d++;
  303. count -= 4;
  304. }
  305. src = reinterpret_cast<const SkPMColor*>(s);
  306. dst = reinterpret_cast<SkPMColor*>(d);
  307. }
  308. while (count > 0) {
  309. *dst = color + SkAlphaMulQ(*src, scale);
  310. src += 1;
  311. dst += 1;
  312. count--;
  313. }
  314. }
  315. }
  316. void SkARGB32_BlitMask_SSE2(void* device, size_t dstRB,
  317. SkBitmap::Config dstConfig, const uint8_t* mask,
  318. size_t maskRB, SkColor origColor,
  319. int width, int height)
  320. {
  321. SkPMColor color = SkPreMultiplyColor(origColor);
  322. size_t dstOffset = dstRB - (width << 2);
  323. size_t maskOffset = maskRB - width;
  324. SkPMColor* dst = (SkPMColor *)device;
  325. do {
  326. int count = width;
  327. if (count >= 4) {
  328. while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
  329. *dst = SkBlendARGB32(color, *dst, *mask);
  330. mask++;
  331. dst++;
  332. count--;
  333. }
  334. __m128i *d = reinterpret_cast<__m128i*>(dst);
  335. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  336. __m128i c_256 = _mm_set1_epi16(256);
  337. __m128i c_1 = _mm_set1_epi16(1);
  338. __m128i src_pixel = _mm_set1_epi32(color);
  339. while (count >= 4) {
  340. // Load 4 pixels each of src and dest.
  341. __m128i dst_pixel = _mm_load_si128(d);
  342. //set the aphla value
  343. __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
  344. 0, *(mask+3),0, \
  345. *(mask+2),0, *(mask+2),\
  346. 0,*(mask+1), 0,*(mask+1),\
  347. 0, *mask,0,*mask);
  348. //call SkAlpha255To256()
  349. src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
  350. // Get red and blue pixels into lower byte of each word.
  351. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  352. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  353. // Get alpha and green into lower byte of each word.
  354. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  355. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  356. // Put per-pixel alpha in low byte of each word.
  357. __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
  358. dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
  359. // dst_alpha = dst_alpha * src_scale
  360. dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
  361. // Divide by 256.
  362. dst_alpha = _mm_srli_epi16(dst_alpha, 8);
  363. // Subtract alphas from 256, to get 1..256
  364. dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
  365. // Multiply red and blue by dst pixel alpha.
  366. dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
  367. // Multiply alpha and green by dst pixel alpha.
  368. dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
  369. // Multiply red and blue by global alpha.
  370. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  371. // Multiply alpha and green by global alpha.
  372. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  373. // Divide by 256.
  374. dst_rb = _mm_srli_epi16(dst_rb, 8);
  375. src_rb = _mm_srli_epi16(src_rb, 8);
  376. // Mask out low bits (goodies already in the right place; no need to divide)
  377. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  378. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  379. // Combine back into RGBA.
  380. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  381. __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
  382. // Add two pixels into result.
  383. __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
  384. _mm_store_si128(d, result);
  385. // load the next 4 pixel
  386. mask = mask + 4;
  387. d++;
  388. count -= 4;
  389. }
  390. dst = reinterpret_cast<SkPMColor *>(d);
  391. }
  392. while(count > 0) {
  393. *dst= SkBlendARGB32(color, *dst, *mask);
  394. dst += 1;
  395. mask++;
  396. count --;
  397. }
  398. dst = (SkPMColor *)((char*)dst + dstOffset);
  399. mask += maskOffset;
  400. } while (--height != 0);
  401. }