PageRenderTime 58ms CodeModel.GetById 32ms RepoModel.GetById 0ms app.codeStats 1ms

/gfx/skia/src/opts/SkBlitRow_opts_SSE2.cpp

https://bitbucket.org/soko/mozilla-central
C++ | 484 lines | 320 code | 82 blank | 82 comment | 43 complexity | e386362c997fc46bc3b52f2b3a91acd2 MD5 | raw file
Possible License(s): GPL-2.0, JSON, 0BSD, LGPL-3.0, AGPL-1.0, MIT, MPL-2.0-no-copyleft-exception, BSD-3-Clause, LGPL-2.1, Apache-2.0
  1. /*
  2. * Copyright 2009 The Android Open Source Project
  3. *
  4. * Use of this source code is governed by a BSD-style license that can be
  5. * found in the LICENSE file.
  6. */
  7. #include "SkBlitRow_opts_SSE2.h"
  8. #include "SkColorPriv.h"
  9. #include "SkUtils.h"
  10. #include <emmintrin.h>
  11. /* SSE2 version of S32_Blend_BlitRow32()
  12. * portable version is in core/SkBlitRow_D32.cpp
  13. */
  14. void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  15. const SkPMColor* SK_RESTRICT src,
  16. int count, U8CPU alpha) {
  17. SkASSERT(alpha <= 255);
  18. if (count <= 0) {
  19. return;
  20. }
  21. uint32_t src_scale = SkAlpha255To256(alpha);
  22. uint32_t dst_scale = 256 - src_scale;
  23. if (count >= 4) {
  24. SkASSERT(((size_t)dst & 0x03) == 0);
  25. while (((size_t)dst & 0x0F) != 0) {
  26. *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  27. src++;
  28. dst++;
  29. count--;
  30. }
  31. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  32. __m128i *d = reinterpret_cast<__m128i*>(dst);
  33. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  34. __m128i src_scale_wide = _mm_set1_epi16(src_scale);
  35. __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
  36. while (count >= 4) {
  37. // Load 4 pixels each of src and dest.
  38. __m128i src_pixel = _mm_loadu_si128(s);
  39. __m128i dst_pixel = _mm_load_si128(d);
  40. // Get red and blue pixels into lower byte of each word.
  41. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  42. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  43. // Get alpha and green into lower byte of each word.
  44. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  45. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  46. // Multiply by scale.
  47. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  48. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  49. dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
  50. dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
  51. // Divide by 256.
  52. src_rb = _mm_srli_epi16(src_rb, 8);
  53. dst_rb = _mm_srli_epi16(dst_rb, 8);
  54. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  55. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  56. // Combine back into RGBA.
  57. src_pixel = _mm_or_si128(src_rb, src_ag);
  58. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  59. // Add result
  60. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  61. _mm_store_si128(d, result);
  62. s++;
  63. d++;
  64. count -= 4;
  65. }
  66. src = reinterpret_cast<const SkPMColor*>(s);
  67. dst = reinterpret_cast<SkPMColor*>(d);
  68. }
  69. while (count > 0) {
  70. *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
  71. src++;
  72. dst++;
  73. count--;
  74. }
  75. }
  76. void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  77. const SkPMColor* SK_RESTRICT src,
  78. int count, U8CPU alpha) {
  79. SkASSERT(alpha == 255);
  80. if (count <= 0) {
  81. return;
  82. }
  83. if (count >= 4) {
  84. SkASSERT(((size_t)dst & 0x03) == 0);
  85. while (((size_t)dst & 0x0F) != 0) {
  86. *dst = SkPMSrcOver(*src, *dst);
  87. src++;
  88. dst++;
  89. count--;
  90. }
  91. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  92. __m128i *d = reinterpret_cast<__m128i*>(dst);
  93. #ifdef SK_USE_ACCURATE_BLENDING
  94. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  95. __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
  96. __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
  97. while (count >= 4) {
  98. // Load 4 pixels
  99. __m128i src_pixel = _mm_loadu_si128(s);
  100. __m128i dst_pixel = _mm_load_si128(d);
  101. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  102. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  103. // Shift alphas down to lower 8 bits of each quad.
  104. __m128i alpha = _mm_srli_epi32(src_pixel, 24);
  105. // Copy alpha to upper 3rd byte of each quad
  106. alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
  107. // Subtract alphas from 255, to get 0..255
  108. alpha = _mm_sub_epi16(c_255, alpha);
  109. // Multiply by red and blue by src alpha.
  110. dst_rb = _mm_mullo_epi16(dst_rb, alpha);
  111. // Multiply by alpha and green by src alpha.
  112. dst_ag = _mm_mullo_epi16(dst_ag, alpha);
  113. // dst_rb_low = (dst_rb >> 8)
  114. __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
  115. __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
  116. // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
  117. dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
  118. dst_rb = _mm_add_epi16(dst_rb, c_128);
  119. dst_rb = _mm_srli_epi16(dst_rb, 8);
  120. // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
  121. dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
  122. dst_ag = _mm_add_epi16(dst_ag, c_128);
  123. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  124. // Combine back into RGBA.
  125. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  126. // Add result
  127. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  128. _mm_store_si128(d, result);
  129. s++;
  130. d++;
  131. count -= 4;
  132. }
  133. #else
  134. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  135. __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
  136. while (count >= 4) {
  137. // Load 4 pixels
  138. __m128i src_pixel = _mm_loadu_si128(s);
  139. __m128i dst_pixel = _mm_load_si128(d);
  140. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  141. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  142. // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
  143. __m128i alpha = _mm_srli_epi16(src_pixel, 8);
  144. // (a0, a0, a1, a1, a2, g2, a3, g3)
  145. alpha = _mm_shufflehi_epi16(alpha, 0xF5);
  146. // (a0, a0, a1, a1, a2, a2, a3, a3)
  147. alpha = _mm_shufflelo_epi16(alpha, 0xF5);
  148. // Subtract alphas from 256, to get 1..256
  149. alpha = _mm_sub_epi16(c_256, alpha);
  150. // Multiply by red and blue by src alpha.
  151. dst_rb = _mm_mullo_epi16(dst_rb, alpha);
  152. // Multiply by alpha and green by src alpha.
  153. dst_ag = _mm_mullo_epi16(dst_ag, alpha);
  154. // Divide by 256.
  155. dst_rb = _mm_srli_epi16(dst_rb, 8);
  156. // Mask out high bits (already in the right place)
  157. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  158. // Combine back into RGBA.
  159. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  160. // Add result
  161. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  162. _mm_store_si128(d, result);
  163. s++;
  164. d++;
  165. count -= 4;
  166. }
  167. #endif
  168. src = reinterpret_cast<const SkPMColor*>(s);
  169. dst = reinterpret_cast<SkPMColor*>(d);
  170. }
  171. while (count > 0) {
  172. *dst = SkPMSrcOver(*src, *dst);
  173. src++;
  174. dst++;
  175. count--;
  176. }
  177. }
  178. void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
  179. const SkPMColor* SK_RESTRICT src,
  180. int count, U8CPU alpha) {
  181. SkASSERT(alpha <= 255);
  182. if (count <= 0) {
  183. return;
  184. }
  185. if (count >= 4) {
  186. while (((size_t)dst & 0x0F) != 0) {
  187. *dst = SkBlendARGB32(*src, *dst, alpha);
  188. src++;
  189. dst++;
  190. count--;
  191. }
  192. uint32_t src_scale = SkAlpha255To256(alpha);
  193. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  194. __m128i *d = reinterpret_cast<__m128i*>(dst);
  195. __m128i src_scale_wide = _mm_set1_epi16(src_scale);
  196. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  197. __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
  198. while (count >= 4) {
  199. // Load 4 pixels each of src and dest.
  200. __m128i src_pixel = _mm_loadu_si128(s);
  201. __m128i dst_pixel = _mm_load_si128(d);
  202. // Get red and blue pixels into lower byte of each word.
  203. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  204. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  205. // Get alpha and green into lower byte of each word.
  206. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  207. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  208. // Put per-pixel alpha in low byte of each word.
  209. __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
  210. dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
  211. // dst_alpha = dst_alpha * src_scale
  212. dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
  213. // Divide by 256.
  214. dst_alpha = _mm_srli_epi16(dst_alpha, 8);
  215. // Subtract alphas from 256, to get 1..256
  216. dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
  217. // Multiply red and blue by dst pixel alpha.
  218. dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
  219. // Multiply alpha and green by dst pixel alpha.
  220. dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
  221. // Multiply red and blue by global alpha.
  222. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  223. // Multiply alpha and green by global alpha.
  224. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  225. // Divide by 256.
  226. dst_rb = _mm_srli_epi16(dst_rb, 8);
  227. src_rb = _mm_srli_epi16(src_rb, 8);
  228. // Mask out low bits (goodies already in the right place; no need to divide)
  229. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  230. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  231. // Combine back into RGBA.
  232. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  233. src_pixel = _mm_or_si128(src_rb, src_ag);
  234. // Add two pixels into result.
  235. __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
  236. _mm_store_si128(d, result);
  237. s++;
  238. d++;
  239. count -= 4;
  240. }
  241. src = reinterpret_cast<const SkPMColor*>(s);
  242. dst = reinterpret_cast<SkPMColor*>(d);
  243. }
  244. while (count > 0) {
  245. *dst = SkBlendARGB32(*src, *dst, alpha);
  246. src++;
  247. dst++;
  248. count--;
  249. }
  250. }
  251. /* SSE2 version of Color32()
  252. * portable version is in core/SkBlitRow_D32.cpp
  253. */
  254. void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
  255. SkPMColor color) {
  256. if (count <= 0) {
  257. return;
  258. }
  259. if (0 == color) {
  260. if (src != dst) {
  261. memcpy(dst, src, count * sizeof(SkPMColor));
  262. }
  263. return;
  264. }
  265. unsigned colorA = SkGetPackedA32(color);
  266. if (255 == colorA) {
  267. sk_memset32(dst, color, count);
  268. } else {
  269. unsigned scale = 256 - SkAlpha255To256(colorA);
  270. if (count >= 4) {
  271. SkASSERT(((size_t)dst & 0x03) == 0);
  272. while (((size_t)dst & 0x0F) != 0) {
  273. *dst = color + SkAlphaMulQ(*src, scale);
  274. src++;
  275. dst++;
  276. count--;
  277. }
  278. const __m128i *s = reinterpret_cast<const __m128i*>(src);
  279. __m128i *d = reinterpret_cast<__m128i*>(dst);
  280. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  281. __m128i src_scale_wide = _mm_set1_epi16(scale);
  282. __m128i color_wide = _mm_set1_epi32(color);
  283. while (count >= 4) {
  284. // Load 4 pixels each of src and dest.
  285. __m128i src_pixel = _mm_loadu_si128(s);
  286. // Get red and blue pixels into lower byte of each word.
  287. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  288. // Get alpha and green into lower byte of each word.
  289. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  290. // Multiply by scale.
  291. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  292. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  293. // Divide by 256.
  294. src_rb = _mm_srli_epi16(src_rb, 8);
  295. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  296. // Combine back into RGBA.
  297. src_pixel = _mm_or_si128(src_rb, src_ag);
  298. // Add color to result.
  299. __m128i result = _mm_add_epi8(color_wide, src_pixel);
  300. // Store result.
  301. _mm_store_si128(d, result);
  302. s++;
  303. d++;
  304. count -= 4;
  305. }
  306. src = reinterpret_cast<const SkPMColor*>(s);
  307. dst = reinterpret_cast<SkPMColor*>(d);
  308. }
  309. while (count > 0) {
  310. *dst = color + SkAlphaMulQ(*src, scale);
  311. src += 1;
  312. dst += 1;
  313. count--;
  314. }
  315. }
  316. }
  317. void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
  318. size_t maskRB, SkColor origColor,
  319. int width, int height)
  320. {
  321. SkPMColor color = SkPreMultiplyColor(origColor);
  322. size_t dstOffset = dstRB - (width << 2);
  323. size_t maskOffset = maskRB - width;
  324. SkPMColor* dst = (SkPMColor *)device;
  325. const uint8_t* mask = (const uint8_t*)maskPtr;
  326. do {
  327. int count = width;
  328. if (count >= 4) {
  329. while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
  330. *dst = SkBlendARGB32(color, *dst, *mask);
  331. mask++;
  332. dst++;
  333. count--;
  334. }
  335. __m128i *d = reinterpret_cast<__m128i*>(dst);
  336. __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
  337. __m128i c_256 = _mm_set1_epi16(256);
  338. __m128i c_1 = _mm_set1_epi16(1);
  339. __m128i src_pixel = _mm_set1_epi32(color);
  340. while (count >= 4) {
  341. // Load 4 pixels each of src and dest.
  342. __m128i dst_pixel = _mm_load_si128(d);
  343. //set the aphla value
  344. __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
  345. 0, *(mask+3),0, \
  346. *(mask+2),0, *(mask+2),\
  347. 0,*(mask+1), 0,*(mask+1),\
  348. 0, *mask,0,*mask);
  349. //call SkAlpha255To256()
  350. src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
  351. // Get red and blue pixels into lower byte of each word.
  352. __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
  353. __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
  354. // Get alpha and green into lower byte of each word.
  355. __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
  356. __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
  357. // Put per-pixel alpha in low byte of each word.
  358. __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
  359. dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
  360. // dst_alpha = dst_alpha * src_scale
  361. dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
  362. // Divide by 256.
  363. dst_alpha = _mm_srli_epi16(dst_alpha, 8);
  364. // Subtract alphas from 256, to get 1..256
  365. dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
  366. // Multiply red and blue by dst pixel alpha.
  367. dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
  368. // Multiply alpha and green by dst pixel alpha.
  369. dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
  370. // Multiply red and blue by global alpha.
  371. src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
  372. // Multiply alpha and green by global alpha.
  373. src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
  374. // Divide by 256.
  375. dst_rb = _mm_srli_epi16(dst_rb, 8);
  376. src_rb = _mm_srli_epi16(src_rb, 8);
  377. // Mask out low bits (goodies already in the right place; no need to divide)
  378. dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
  379. src_ag = _mm_andnot_si128(rb_mask, src_ag);
  380. // Combine back into RGBA.
  381. dst_pixel = _mm_or_si128(dst_rb, dst_ag);
  382. __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
  383. // Add two pixels into result.
  384. __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
  385. _mm_store_si128(d, result);
  386. // load the next 4 pixel
  387. mask = mask + 4;
  388. d++;
  389. count -= 4;
  390. }
  391. dst = reinterpret_cast<SkPMColor *>(d);
  392. }
  393. while(count > 0) {
  394. *dst= SkBlendARGB32(color, *dst, *mask);
  395. dst += 1;
  396. mask++;
  397. count --;
  398. }
  399. dst = (SkPMColor *)((char*)dst + dstOffset);
  400. mask += maskOffset;
  401. } while (--height != 0);
  402. }