/gfx/skia/src/opts/SkBlitRow_opts_SSE2.cpp
C++ | 484 lines | 320 code | 82 blank | 82 comment | 43 complexity | e386362c997fc46bc3b52f2b3a91acd2 MD5 | raw file
Possible License(s): GPL-2.0, JSON, 0BSD, LGPL-3.0, AGPL-1.0, MIT, MPL-2.0-no-copyleft-exception, BSD-3-Clause, LGPL-2.1, Apache-2.0
- /*
- * Copyright 2009 The Android Open Source Project
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
- #include "SkBlitRow_opts_SSE2.h"
- #include "SkColorPriv.h"
- #include "SkUtils.h"
- #include <emmintrin.h>
- /* SSE2 version of S32_Blend_BlitRow32()
- * portable version is in core/SkBlitRow_D32.cpp
- */
- void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
- SkASSERT(alpha <= 255);
- if (count <= 0) {
- return;
- }
- uint32_t src_scale = SkAlpha255To256(alpha);
- uint32_t dst_scale = 256 - src_scale;
- if (count >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
- *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
- src++;
- dst++;
- count--;
- }
- const __m128i *s = reinterpret_cast<const __m128i*>(src);
- __m128i *d = reinterpret_cast<__m128i*>(dst);
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i src_scale_wide = _mm_set1_epi16(src_scale);
- __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
- while (count >= 4) {
- // Load 4 pixels each of src and dest.
- __m128i src_pixel = _mm_loadu_si128(s);
- __m128i dst_pixel = _mm_load_si128(d);
- // Get red and blue pixels into lower byte of each word.
- __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
- __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
- // Get alpha and green into lower byte of each word.
- __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
- __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
- // Multiply by scale.
- src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
- src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
- dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
- dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
- // Divide by 256.
- src_rb = _mm_srli_epi16(src_rb, 8);
- dst_rb = _mm_srli_epi16(dst_rb, 8);
- src_ag = _mm_andnot_si128(rb_mask, src_ag);
- dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
- // Combine back into RGBA.
- src_pixel = _mm_or_si128(src_rb, src_ag);
- dst_pixel = _mm_or_si128(dst_rb, dst_ag);
- // Add result
- __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
- _mm_store_si128(d, result);
- s++;
- d++;
- count -= 4;
- }
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<SkPMColor*>(d);
- }
- while (count > 0) {
- *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
- src++;
- dst++;
- count--;
- }
- }
- void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
- SkASSERT(alpha == 255);
- if (count <= 0) {
- return;
- }
- if (count >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
- *dst = SkPMSrcOver(*src, *dst);
- src++;
- dst++;
- count--;
- }
- const __m128i *s = reinterpret_cast<const __m128i*>(src);
- __m128i *d = reinterpret_cast<__m128i*>(dst);
- #ifdef SK_USE_ACCURATE_BLENDING
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
- __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
- while (count >= 4) {
- // Load 4 pixels
- __m128i src_pixel = _mm_loadu_si128(s);
- __m128i dst_pixel = _mm_load_si128(d);
- __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
- __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
- // Shift alphas down to lower 8 bits of each quad.
- __m128i alpha = _mm_srli_epi32(src_pixel, 24);
- // Copy alpha to upper 3rd byte of each quad
- alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
- // Subtract alphas from 255, to get 0..255
- alpha = _mm_sub_epi16(c_255, alpha);
- // Multiply by red and blue by src alpha.
- dst_rb = _mm_mullo_epi16(dst_rb, alpha);
- // Multiply by alpha and green by src alpha.
- dst_ag = _mm_mullo_epi16(dst_ag, alpha);
- // dst_rb_low = (dst_rb >> 8)
- __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
- __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
- // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
- dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
- dst_rb = _mm_add_epi16(dst_rb, c_128);
- dst_rb = _mm_srli_epi16(dst_rb, 8);
- // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
- dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
- dst_ag = _mm_add_epi16(dst_ag, c_128);
- dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
- // Combine back into RGBA.
- dst_pixel = _mm_or_si128(dst_rb, dst_ag);
- // Add result
- __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
- _mm_store_si128(d, result);
- s++;
- d++;
- count -= 4;
- }
- #else
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
- while (count >= 4) {
- // Load 4 pixels
- __m128i src_pixel = _mm_loadu_si128(s);
- __m128i dst_pixel = _mm_load_si128(d);
- __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
- __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
- // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
- __m128i alpha = _mm_srli_epi16(src_pixel, 8);
- // (a0, a0, a1, a1, a2, g2, a3, g3)
- alpha = _mm_shufflehi_epi16(alpha, 0xF5);
- // (a0, a0, a1, a1, a2, a2, a3, a3)
- alpha = _mm_shufflelo_epi16(alpha, 0xF5);
- // Subtract alphas from 256, to get 1..256
- alpha = _mm_sub_epi16(c_256, alpha);
- // Multiply by red and blue by src alpha.
- dst_rb = _mm_mullo_epi16(dst_rb, alpha);
- // Multiply by alpha and green by src alpha.
- dst_ag = _mm_mullo_epi16(dst_ag, alpha);
- // Divide by 256.
- dst_rb = _mm_srli_epi16(dst_rb, 8);
- // Mask out high bits (already in the right place)
- dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
- // Combine back into RGBA.
- dst_pixel = _mm_or_si128(dst_rb, dst_ag);
- // Add result
- __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
- _mm_store_si128(d, result);
- s++;
- d++;
- count -= 4;
- }
- #endif
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<SkPMColor*>(d);
- }
- while (count > 0) {
- *dst = SkPMSrcOver(*src, *dst);
- src++;
- dst++;
- count--;
- }
- }
- void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
- const SkPMColor* SK_RESTRICT src,
- int count, U8CPU alpha) {
- SkASSERT(alpha <= 255);
- if (count <= 0) {
- return;
- }
- if (count >= 4) {
- while (((size_t)dst & 0x0F) != 0) {
- *dst = SkBlendARGB32(*src, *dst, alpha);
- src++;
- dst++;
- count--;
- }
- uint32_t src_scale = SkAlpha255To256(alpha);
- const __m128i *s = reinterpret_cast<const __m128i*>(src);
- __m128i *d = reinterpret_cast<__m128i*>(dst);
- __m128i src_scale_wide = _mm_set1_epi16(src_scale);
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
- while (count >= 4) {
- // Load 4 pixels each of src and dest.
- __m128i src_pixel = _mm_loadu_si128(s);
- __m128i dst_pixel = _mm_load_si128(d);
- // Get red and blue pixels into lower byte of each word.
- __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
- __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
- // Get alpha and green into lower byte of each word.
- __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
- __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
- // Put per-pixel alpha in low byte of each word.
- __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
- dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
- // dst_alpha = dst_alpha * src_scale
- dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
- // Divide by 256.
- dst_alpha = _mm_srli_epi16(dst_alpha, 8);
- // Subtract alphas from 256, to get 1..256
- dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
- // Multiply red and blue by dst pixel alpha.
- dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
- // Multiply alpha and green by dst pixel alpha.
- dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
- // Multiply red and blue by global alpha.
- src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
- // Multiply alpha and green by global alpha.
- src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
- // Divide by 256.
- dst_rb = _mm_srli_epi16(dst_rb, 8);
- src_rb = _mm_srli_epi16(src_rb, 8);
- // Mask out low bits (goodies already in the right place; no need to divide)
- dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
- src_ag = _mm_andnot_si128(rb_mask, src_ag);
- // Combine back into RGBA.
- dst_pixel = _mm_or_si128(dst_rb, dst_ag);
- src_pixel = _mm_or_si128(src_rb, src_ag);
- // Add two pixels into result.
- __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
- _mm_store_si128(d, result);
- s++;
- d++;
- count -= 4;
- }
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<SkPMColor*>(d);
- }
- while (count > 0) {
- *dst = SkBlendARGB32(*src, *dst, alpha);
- src++;
- dst++;
- count--;
- }
- }
- /* SSE2 version of Color32()
- * portable version is in core/SkBlitRow_D32.cpp
- */
- void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
- SkPMColor color) {
- if (count <= 0) {
- return;
- }
- if (0 == color) {
- if (src != dst) {
- memcpy(dst, src, count * sizeof(SkPMColor));
- }
- return;
- }
- unsigned colorA = SkGetPackedA32(color);
- if (255 == colorA) {
- sk_memset32(dst, color, count);
- } else {
- unsigned scale = 256 - SkAlpha255To256(colorA);
- if (count >= 4) {
- SkASSERT(((size_t)dst & 0x03) == 0);
- while (((size_t)dst & 0x0F) != 0) {
- *dst = color + SkAlphaMulQ(*src, scale);
- src++;
- dst++;
- count--;
- }
- const __m128i *s = reinterpret_cast<const __m128i*>(src);
- __m128i *d = reinterpret_cast<__m128i*>(dst);
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i src_scale_wide = _mm_set1_epi16(scale);
- __m128i color_wide = _mm_set1_epi32(color);
- while (count >= 4) {
- // Load 4 pixels each of src and dest.
- __m128i src_pixel = _mm_loadu_si128(s);
- // Get red and blue pixels into lower byte of each word.
- __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
- // Get alpha and green into lower byte of each word.
- __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
- // Multiply by scale.
- src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
- src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
- // Divide by 256.
- src_rb = _mm_srli_epi16(src_rb, 8);
- src_ag = _mm_andnot_si128(rb_mask, src_ag);
- // Combine back into RGBA.
- src_pixel = _mm_or_si128(src_rb, src_ag);
- // Add color to result.
- __m128i result = _mm_add_epi8(color_wide, src_pixel);
- // Store result.
- _mm_store_si128(d, result);
- s++;
- d++;
- count -= 4;
- }
- src = reinterpret_cast<const SkPMColor*>(s);
- dst = reinterpret_cast<SkPMColor*>(d);
- }
- while (count > 0) {
- *dst = color + SkAlphaMulQ(*src, scale);
- src += 1;
- dst += 1;
- count--;
- }
- }
- }
- void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
- size_t maskRB, SkColor origColor,
- int width, int height)
- {
- SkPMColor color = SkPreMultiplyColor(origColor);
- size_t dstOffset = dstRB - (width << 2);
- size_t maskOffset = maskRB - width;
- SkPMColor* dst = (SkPMColor *)device;
- const uint8_t* mask = (const uint8_t*)maskPtr;
- do {
- int count = width;
- if (count >= 4) {
- while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
- *dst = SkBlendARGB32(color, *dst, *mask);
- mask++;
- dst++;
- count--;
- }
- __m128i *d = reinterpret_cast<__m128i*>(dst);
- __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
- __m128i c_256 = _mm_set1_epi16(256);
- __m128i c_1 = _mm_set1_epi16(1);
- __m128i src_pixel = _mm_set1_epi32(color);
- while (count >= 4) {
- // Load 4 pixels each of src and dest.
- __m128i dst_pixel = _mm_load_si128(d);
- //set the aphla value
- __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
- 0, *(mask+3),0, \
- *(mask+2),0, *(mask+2),\
- 0,*(mask+1), 0,*(mask+1),\
- 0, *mask,0,*mask);
- //call SkAlpha255To256()
- src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
- // Get red and blue pixels into lower byte of each word.
- __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
- __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
- // Get alpha and green into lower byte of each word.
- __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
- __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
- // Put per-pixel alpha in low byte of each word.
- __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
- dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
- // dst_alpha = dst_alpha * src_scale
- dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
- // Divide by 256.
- dst_alpha = _mm_srli_epi16(dst_alpha, 8);
- // Subtract alphas from 256, to get 1..256
- dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
- // Multiply red and blue by dst pixel alpha.
- dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
- // Multiply alpha and green by dst pixel alpha.
- dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
- // Multiply red and blue by global alpha.
- src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
- // Multiply alpha and green by global alpha.
- src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
- // Divide by 256.
- dst_rb = _mm_srli_epi16(dst_rb, 8);
- src_rb = _mm_srli_epi16(src_rb, 8);
- // Mask out low bits (goodies already in the right place; no need to divide)
- dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
- src_ag = _mm_andnot_si128(rb_mask, src_ag);
- // Combine back into RGBA.
- dst_pixel = _mm_or_si128(dst_rb, dst_ag);
- __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
- // Add two pixels into result.
- __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
- _mm_store_si128(d, result);
- // load the next 4 pixel
- mask = mask + 4;
- d++;
- count -= 4;
- }
- dst = reinterpret_cast<SkPMColor *>(d);
- }
- while(count > 0) {
- *dst= SkBlendARGB32(color, *dst, *mask);
- dst += 1;
- mask++;
- count --;
- }
- dst = (SkPMColor *)((char*)dst + dstOffset);
- mask += maskOffset;
- } while (--height != 0);
- }