/luminance-hdr-2.3.0/src/Libpfs/vex.cpp
# · C++ · 657 lines · 507 code · 107 blank · 43 comment · 50 complexity · 3ef98a3d355fe43d24fd3dc5481a8660 MD5 · raw file
- /**
- * @brief SSE for high performance vector operations
- *
- * This file is a part of LuminanceHDR package
- * ----------------------------------------------------------------------
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- * ----------------------------------------------------------------------
- *
- * @author Davide Anastasia, <davideanastasia@users.sourceforge.net>
- *
- */
- #include <iostream>
- #include "vex.h"
- #ifdef _OPENMP
- #include <omp.h>
- #endif
- // Prefetch definition taken from:
- // http://software.intel.com/en-us/forums/showthread.php?t=46284
- // tune FETCH_DISTANCE according to real world experiments
- #define PREFETCH_T0(addr, nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead, _MM_HINT_T0)
- #define FETCH_DISTANCE 512
- void VEX_vsub(const float* A, const float* B, float* C, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- __m128 a, b, c;
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(a,b,c)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&A[l], FETCH_DISTANCE);
- PREFETCH_T0(&B[l], FETCH_DISTANCE);
- PREFETCH_T0(&C[l], FETCH_DISTANCE);
-
- a = _mm_load_ps(&A[l]);
- b = _mm_load_ps(&B[l]);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l], c);
-
- a = _mm_load_ps(&A[l+4]);
- b = _mm_load_ps(&B[l+4]);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l+4], c);
-
- a = _mm_load_ps(&A[l+8]);
- b = _mm_load_ps(&B[l+8]);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l+8], c);
-
- a = _mm_load_ps(&A[l+12]);
- b = _mm_load_ps(&B[l+12]);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l+12], c);
- }
-
- const float* pA = &A[ELEMS_LOOP1];
- const float* pB = &B[ELEMS_LOOP1];
- float* pC = &C[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- a = _mm_load_ss(&pA[l]);
- b = _mm_load_ss(&pB[l]);
- c = _mm_sub_ss(a, b);
- _mm_store_ss(&pC[l], c);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx )
- {
- C[idx] = A[idx] - B[idx];
- }
- #endif
- }
- void VEX_vsubs(const float* A, const float premultiplier, const float* B, float* C, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- __m128 a, b, c;
- const __m128 val = _mm_set1_ps(premultiplier);
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(a,b,c)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&A[l], FETCH_DISTANCE);
- PREFETCH_T0(&B[l], FETCH_DISTANCE);
- PREFETCH_T0(&C[l], FETCH_DISTANCE);
-
- a = _mm_load_ps(&A[l]);
- b = _mm_load_ps(&B[l]);
- b = _mm_mul_ps(b, val);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l], c);
-
- a = _mm_load_ps(&A[l+4]);
- b = _mm_load_ps(&B[l+4]);
- b = _mm_mul_ps(b, val);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l+4], c);
-
- a = _mm_load_ps(&A[l+8]);
- b = _mm_load_ps(&B[l+8]);
- b = _mm_mul_ps(b, val);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l+8], c);
-
- a = _mm_load_ps(&A[l+12]);
- b = _mm_load_ps(&B[l+12]);
- b = _mm_mul_ps(b, val);
- c = _mm_sub_ps(a, b);
- _mm_store_ps(&C[l+12], c);
- }
-
- const float* pA = &A[ELEMS_LOOP1];
- const float* pB = &B[ELEMS_LOOP1];
- float* pC = &C[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- a = _mm_load_ss(&pA[l]);
- b = _mm_load_ss(&pB[l]);
- b = _mm_mul_ss(b, val);
- c = _mm_sub_ss(a, b);
- _mm_store_ss(&pC[l], c);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx )
- {
- C[idx] = A[idx] - premultiplier * B[idx];
- }
- #endif
- }
- void VEX_vadd(const float* A, const float* B, float* C, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- __m128 a, b, c;
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(a,b,c)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&A[l], FETCH_DISTANCE);
- PREFETCH_T0(&B[l], FETCH_DISTANCE);
- PREFETCH_T0(&C[l], FETCH_DISTANCE);
-
- a = _mm_load_ps(&A[l]);
- b = _mm_load_ps(&B[l]);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l], c);
-
- a = _mm_load_ps(&A[l+4]);
- b = _mm_load_ps(&B[l+4]);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l+4], c);
-
- a = _mm_load_ps(&A[l+8]);
- b = _mm_load_ps(&B[l+8]);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l+8], c);
-
- a = _mm_load_ps(&A[l+12]);
- b = _mm_load_ps(&B[l+12]);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l+12], c);
- }
-
- const float* pA = &A[ELEMS_LOOP1];
- const float* pB = &B[ELEMS_LOOP1];
- float* pC = &C[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- a = _mm_load_ss(&pA[l]);
- b = _mm_load_ss(&pB[l]);
- c = _mm_add_ss(a, b);
- _mm_store_ss(&pC[l], c);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx )
- {
- C[idx] = A[idx] + B[idx];
- }
- #endif
- }
- void VEX_vadds(const float* A, const float premultiplier, const float* B, float* C, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- const __m128 val = _mm_set1_ps(premultiplier);
- __m128 a, b, c;
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(a,b,c)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&A[l], FETCH_DISTANCE);
- PREFETCH_T0(&B[l], FETCH_DISTANCE);
- PREFETCH_T0(&C[l], FETCH_DISTANCE);
-
- a = _mm_load_ps(&A[l]);
- b = _mm_load_ps(&B[l]);
- b = _mm_mul_ps(b, val);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l], c);
-
- a = _mm_load_ps(&A[l+4]);
- b = _mm_load_ps(&B[l+4]);
- b = _mm_mul_ps(b, val);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l+4], c);
-
- a = _mm_load_ps(&A[l+8]);
- b = _mm_load_ps(&B[l+8]);
- b = _mm_mul_ps(b, val);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l+8], c);
-
- a = _mm_load_ps(&A[l+12]);
- b = _mm_load_ps(&B[l+12]);
- b = _mm_mul_ps(b, val);
- c = _mm_add_ps(a, b);
- _mm_store_ps(&C[l+12], c);
- }
-
- const float* pA = &A[ELEMS_LOOP1];
- const float* pB = &B[ELEMS_LOOP1];
- float* pC = &C[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- a = _mm_load_ss(&pA[l]);
- b = _mm_load_ss(&pB[l]);
- b = _mm_mul_ss(b, val);
- c = _mm_add_ss(a, b);
- _mm_store_ss(&pC[l], c);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx )
- {
- C[idx] = A[idx] + premultiplier * B[idx];
- }
- #endif
- }
- void VEX_vsmul(const float* I, const float premultiplier, float* O, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- const __m128 val = _mm_set1_ps(premultiplier);
- __m128 t;
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(t)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&I[l], FETCH_DISTANCE);
- PREFETCH_T0(&O[l], FETCH_DISTANCE);
-
- t = _mm_load_ps(&I[l]);
- t = _mm_mul_ps(t, val);
- _mm_store_ps(&O[l], t);
-
- t = _mm_load_ps(&I[l+4]);
- t = _mm_mul_ps(t, val);
- _mm_store_ps(&O[l+4], t);
-
- t = _mm_load_ps(&I[l+8]);
- t = _mm_mul_ps(t, val);
- _mm_store_ps(&O[l+8], t);
-
- t = _mm_load_ps(&I[l+12]);
- t = _mm_mul_ps(t, val);
- _mm_store_ps(&O[l+12], t);
- }
-
- const float* pI = &I[ELEMS_LOOP1];
- float* pO = &O[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- t = _mm_load_ss(&pI[l]);
- t = _mm_mul_ss(t, val);
- _mm_store_ss(&pO[l], t);
- }
- #else
- // plain code
- #pragma omp parallel for
- for(int idx = 0; idx < N; ++idx)
- {
- O[idx] = premultiplier * I[idx];
- }
- #endif
- }
- void VEX_vmul(const float* A, const float* B, float* C, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- __m128 a, b;
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(a, b)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&A[l], FETCH_DISTANCE);
- PREFETCH_T0(&B[l], FETCH_DISTANCE);
- PREFETCH_T0(&C[l], FETCH_DISTANCE);
-
- a = _mm_load_ps(&A[l]);
- b = _mm_load_ps(&B[l]);
- a = _mm_mul_ps(a, b);
- _mm_store_ps(&C[l], a);
-
- a = _mm_load_ps(&A[l+4]);
- b = _mm_load_ps(&B[l+4]);
- a = _mm_mul_ps(a, b);
- _mm_store_ps(&C[l+4], a);
-
- a = _mm_load_ps(&A[l+8]);
- b = _mm_load_ps(&B[l+8]);
- a = _mm_mul_ps(a, b);
- _mm_store_ps(&C[l+8], a);
-
- a = _mm_load_ps(&A[l+12]);
- b = _mm_load_ps(&B[l+12]);
- a = _mm_mul_ps(a, b);
- _mm_store_ps(&C[l+12], a);
- }
-
- const float* pA = &A[ELEMS_LOOP1];
- const float* pB = &B[ELEMS_LOOP1];
- float* pC = &C[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- a = _mm_load_ss(&pA[l]);
- b = _mm_load_ss(&pB[l]);
- a = _mm_mul_ss(a, b);
- _mm_store_ss(&pC[l], a);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx )
- {
- C[idx] = A[idx] * B[idx];
- }
- #endif
- }
- void VEX_vdiv(const float* A, const float* B, float* C, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- __m128 a, b;
-
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120) private(a, b)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&A[l], FETCH_DISTANCE);
- PREFETCH_T0(&B[l], FETCH_DISTANCE);
- PREFETCH_T0(&C[l], FETCH_DISTANCE);
-
- a = _mm_load_ps(&A[l]);
- b = _mm_load_ps(&B[l]);
- a = _mm_div_ps(a, b);
- _mm_store_ps(&C[l], a);
-
- a = _mm_load_ps(&A[l+4]);
- b = _mm_load_ps(&B[l+4]);
- a = _mm_div_ps(a, b);
- _mm_store_ps(&C[l+4], a);
-
- a = _mm_load_ps(&A[l+8]);
- b = _mm_load_ps(&B[l+8]);
- a = _mm_div_ps(a, b);
- _mm_store_ps(&C[l+8], a);
-
- a = _mm_load_ps(&A[l+12]);
- b = _mm_load_ps(&B[l+12]);
- a = _mm_div_ps(a, b);
- _mm_store_ps(&C[l+12], a);
- }
-
- const float* pA = &A[ELEMS_LOOP1];
- const float* pB = &B[ELEMS_LOOP1];
- float* pC = &C[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- a = _mm_load_ss(&pA[l]);
- b = _mm_load_ss(&pB[l]);
- a = _mm_div_ss(a, b);
- _mm_store_ss(&pC[l], a);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx )
- {
- C[idx] = A[idx] / B[idx];
- }
- #endif
- }
- void VEX_vcopy(const float* I, float* O, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- #pragma omp parallel for schedule(static, 5120)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&O[l], FETCH_DISTANCE);
- PREFETCH_T0(&I[l], FETCH_DISTANCE);
-
- _mm_store_ps(&O[l], _mm_load_ps(&I[l]));
- _mm_store_ps(&O[l+4], _mm_load_ps(&I[l+4]));
- _mm_store_ps(&O[l+8], _mm_load_ps(&I[l+8]));
- _mm_store_ps(&O[l+12], _mm_load_ps(&I[l+12]));
- }
-
- const float* pI = &I[ELEMS_LOOP1];
- float* pO = &O[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- _mm_store_ss(&pO[l], _mm_load_ss(&pI[l]));
- }
- _mm_sfence();
- #else
- // plain code
- #pragma omp parallel for
- for(int idx = 0; idx < N; ++idx)
- {
- O[idx] = I[idx];
- }
- #endif
- }
- void VEX_vset(float* IO, const float premultiplier, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- const __m128 val = _mm_set1_ps(premultiplier);
-
- #pragma omp parallel for schedule(static, 5120)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&IO[l], FETCH_DISTANCE);
-
- _mm_store_ps(&IO[l], val);
- _mm_store_ps(&IO[l+4], val);
- _mm_store_ps(&IO[l+8], val);
- _mm_store_ps(&IO[l+12], val);
- }
-
- float* pIO = &IO[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- _mm_store_ss(&pIO[l], val);
- }
- #else
- // plain code
- #pragma omp parallel for
- for(int idx = 0; idx < N; ++idx)
- {
- IO[idx] = premultiplier;
- }
- #endif
- }
- void VEX_vreset(float* IO, const int N)
- {
- #ifdef LUMINANCE_USE_SSE
- const int LOOP1 = (N >> 4);
- const int ELEMS_LOOP1 = (LOOP1 << 4);
- const int LOOP2 = (N - ELEMS_LOOP1);
-
- const __m128 zero = _mm_setzero_ps();
-
- #pragma omp parallel for schedule(static, 5120)
- for (int l = 0; l < ELEMS_LOOP1; l+=16)
- {
- PREFETCH_T0(&IO[l], FETCH_DISTANCE);
-
- _mm_store_ps(&IO[l], zero);
- _mm_store_ps(&IO[l+4], zero);
- _mm_store_ps(&IO[l+8], zero);
- _mm_store_ps(&IO[l+12], zero);
- }
-
- float* pIO = &IO[ELEMS_LOOP1];
-
- for (int l = 0; l < LOOP2; l++)
- {
- _mm_store_ss(&pIO[l], zero);
- }
- #else
- // plain code
- #pragma omp parallel for
- for (int idx = 0; idx < N; ++idx)
- {
- IO[idx] = 0.0f;
- }
- #endif
- }
- void VEX_dotpr(const float* I1, const float* I2, float& val, const int N)
- {
- float t_val = 0.0f;
- #pragma omp parallel for reduction(+:t_val)
- for (int idx = 0; idx < N; ++idx)
- {
- t_val += I1[idx] * I2[idx];
- }
- val = t_val;
- }
- #ifdef LUMINANCE_USE_SSE
- /* Implementation lifted from http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html */
- #define EXP_POLY_DEGREE 5
- #define POLY0(x, c0) _mm_set1_ps(c0)
- #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
- #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
- #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
- #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
- #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
- v4sf _mm_exp2_ps(v4sf x)
- {
- __m128i ipart;
- v4sf fpart, expipart, expfpart;
- x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
- x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
- /* ipart = int(x - 0.5) */
- ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
- /* fpart = x - ipart */
- fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
- /* expipart = (float) (1 << ipart) */
- expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
- /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
- #if EXP_POLY_DEGREE == 5
- expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
- #elif EXP_POLY_DEGREE == 4
- expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
- #elif EXP_POLY_DEGREE == 3
- expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
- #elif EXP_POLY_DEGREE == 2
- expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
- #else
- #error
- #endif
- return _mm_mul_ps(expipart, expfpart);
- }
- #define LOG_POLY_DEGREE 5
- v4sf _mm_log2_ps(v4sf x)
- {
- __m128i exp = _mm_set1_epi32(0x7F800000);
- __m128i mant = _mm_set1_epi32(0x007FFFFF);
- v4sf one = _mm_set1_ps(1.0f);
- __m128i i = _mm_castps_si128(x);
- v4sf e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_set1_epi32(127)));
- v4sf m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
- v4sf p;
- /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
- #if LOG_POLY_DEGREE == 6
- p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
- #elif LOG_POLY_DEGREE == 5
- p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
- #elif LOG_POLY_DEGREE == 4
- p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
- #elif LOG_POLY_DEGREE == 3
- p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
- #else
- #error
- #endif
- /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
- p = _mm_mul_ps(p, _mm_sub_ps(m, one));
- return _mm_add_ps(p, e);
- }
- v4sf _mm_pow_ps(v4sf x, v4sf y)
- {
- return _mm_exp2_ps(_mm_log2_ps(x) * y);
- }
- #endif // LUMINANCE_USE_SSE