PageRenderTime 58ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/luminance-hdr-2.3.0/src/Libpfs/vex.cpp

#
C++ | 657 lines | 507 code | 107 blank | 43 comment | 50 complexity | 3ef98a3d355fe43d24fd3dc5481a8660 MD5 | raw file
Possible License(s): GPL-2.0
  1. /**
  2. * @brief SSE for high performance vector operations
  3. *
  4. * This file is a part of LuminanceHDR package
  5. * ----------------------------------------------------------------------
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * along with this program; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. * ----------------------------------------------------------------------
  20. *
  21. * @author Davide Anastasia, <davideanastasia@users.sourceforge.net>
  22. *
  23. */
  24. #include <iostream>
  25. #include "vex.h"
  26. #ifdef _OPENMP
  27. #include <omp.h>
  28. #endif
  29. // Prefetch definition taken from:
  30. // http://software.intel.com/en-us/forums/showthread.php?t=46284
  31. // tune FETCH_DISTANCE according to real world experiments
  32. #define PREFETCH_T0(addr, nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead, _MM_HINT_T0)
  33. #define FETCH_DISTANCE 512
  34. void VEX_vsub(const float* A, const float* B, float* C, const int N)
  35. {
  36. #ifdef LUMINANCE_USE_SSE
  37. __m128 a, b, c;
  38. const int LOOP1 = (N >> 4);
  39. const int ELEMS_LOOP1 = (LOOP1 << 4);
  40. const int LOOP2 = (N - ELEMS_LOOP1);
  41. #pragma omp parallel for schedule(static, 5120) private(a,b,c)
  42. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  43. {
  44. PREFETCH_T0(&A[l], FETCH_DISTANCE);
  45. PREFETCH_T0(&B[l], FETCH_DISTANCE);
  46. PREFETCH_T0(&C[l], FETCH_DISTANCE);
  47. a = _mm_load_ps(&A[l]);
  48. b = _mm_load_ps(&B[l]);
  49. c = _mm_sub_ps(a, b);
  50. _mm_store_ps(&C[l], c);
  51. a = _mm_load_ps(&A[l+4]);
  52. b = _mm_load_ps(&B[l+4]);
  53. c = _mm_sub_ps(a, b);
  54. _mm_store_ps(&C[l+4], c);
  55. a = _mm_load_ps(&A[l+8]);
  56. b = _mm_load_ps(&B[l+8]);
  57. c = _mm_sub_ps(a, b);
  58. _mm_store_ps(&C[l+8], c);
  59. a = _mm_load_ps(&A[l+12]);
  60. b = _mm_load_ps(&B[l+12]);
  61. c = _mm_sub_ps(a, b);
  62. _mm_store_ps(&C[l+12], c);
  63. }
  64. const float* pA = &A[ELEMS_LOOP1];
  65. const float* pB = &B[ELEMS_LOOP1];
  66. float* pC = &C[ELEMS_LOOP1];
  67. for (int l = 0; l < LOOP2; l++)
  68. {
  69. a = _mm_load_ss(&pA[l]);
  70. b = _mm_load_ss(&pB[l]);
  71. c = _mm_sub_ss(a, b);
  72. _mm_store_ss(&pC[l], c);
  73. }
  74. #else
  75. // plain code
  76. #pragma omp parallel for
  77. for (int idx = 0; idx < N; ++idx )
  78. {
  79. C[idx] = A[idx] - B[idx];
  80. }
  81. #endif
  82. }
  83. void VEX_vsubs(const float* A, const float premultiplier, const float* B, float* C, const int N)
  84. {
  85. #ifdef LUMINANCE_USE_SSE
  86. __m128 a, b, c;
  87. const __m128 val = _mm_set1_ps(premultiplier);
  88. const int LOOP1 = (N >> 4);
  89. const int ELEMS_LOOP1 = (LOOP1 << 4);
  90. const int LOOP2 = (N - ELEMS_LOOP1);
  91. #pragma omp parallel for schedule(static, 5120) private(a,b,c)
  92. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  93. {
  94. PREFETCH_T0(&A[l], FETCH_DISTANCE);
  95. PREFETCH_T0(&B[l], FETCH_DISTANCE);
  96. PREFETCH_T0(&C[l], FETCH_DISTANCE);
  97. a = _mm_load_ps(&A[l]);
  98. b = _mm_load_ps(&B[l]);
  99. b = _mm_mul_ps(b, val);
  100. c = _mm_sub_ps(a, b);
  101. _mm_store_ps(&C[l], c);
  102. a = _mm_load_ps(&A[l+4]);
  103. b = _mm_load_ps(&B[l+4]);
  104. b = _mm_mul_ps(b, val);
  105. c = _mm_sub_ps(a, b);
  106. _mm_store_ps(&C[l+4], c);
  107. a = _mm_load_ps(&A[l+8]);
  108. b = _mm_load_ps(&B[l+8]);
  109. b = _mm_mul_ps(b, val);
  110. c = _mm_sub_ps(a, b);
  111. _mm_store_ps(&C[l+8], c);
  112. a = _mm_load_ps(&A[l+12]);
  113. b = _mm_load_ps(&B[l+12]);
  114. b = _mm_mul_ps(b, val);
  115. c = _mm_sub_ps(a, b);
  116. _mm_store_ps(&C[l+12], c);
  117. }
  118. const float* pA = &A[ELEMS_LOOP1];
  119. const float* pB = &B[ELEMS_LOOP1];
  120. float* pC = &C[ELEMS_LOOP1];
  121. for (int l = 0; l < LOOP2; l++)
  122. {
  123. a = _mm_load_ss(&pA[l]);
  124. b = _mm_load_ss(&pB[l]);
  125. b = _mm_mul_ss(b, val);
  126. c = _mm_sub_ss(a, b);
  127. _mm_store_ss(&pC[l], c);
  128. }
  129. #else
  130. // plain code
  131. #pragma omp parallel for
  132. for (int idx = 0; idx < N; ++idx )
  133. {
  134. C[idx] = A[idx] - premultiplier * B[idx];
  135. }
  136. #endif
  137. }
  138. void VEX_vadd(const float* A, const float* B, float* C, const int N)
  139. {
  140. #ifdef LUMINANCE_USE_SSE
  141. __m128 a, b, c;
  142. const int LOOP1 = (N >> 4);
  143. const int ELEMS_LOOP1 = (LOOP1 << 4);
  144. const int LOOP2 = (N - ELEMS_LOOP1);
  145. #pragma omp parallel for schedule(static, 5120) private(a,b,c)
  146. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  147. {
  148. PREFETCH_T0(&A[l], FETCH_DISTANCE);
  149. PREFETCH_T0(&B[l], FETCH_DISTANCE);
  150. PREFETCH_T0(&C[l], FETCH_DISTANCE);
  151. a = _mm_load_ps(&A[l]);
  152. b = _mm_load_ps(&B[l]);
  153. c = _mm_add_ps(a, b);
  154. _mm_store_ps(&C[l], c);
  155. a = _mm_load_ps(&A[l+4]);
  156. b = _mm_load_ps(&B[l+4]);
  157. c = _mm_add_ps(a, b);
  158. _mm_store_ps(&C[l+4], c);
  159. a = _mm_load_ps(&A[l+8]);
  160. b = _mm_load_ps(&B[l+8]);
  161. c = _mm_add_ps(a, b);
  162. _mm_store_ps(&C[l+8], c);
  163. a = _mm_load_ps(&A[l+12]);
  164. b = _mm_load_ps(&B[l+12]);
  165. c = _mm_add_ps(a, b);
  166. _mm_store_ps(&C[l+12], c);
  167. }
  168. const float* pA = &A[ELEMS_LOOP1];
  169. const float* pB = &B[ELEMS_LOOP1];
  170. float* pC = &C[ELEMS_LOOP1];
  171. for (int l = 0; l < LOOP2; l++)
  172. {
  173. a = _mm_load_ss(&pA[l]);
  174. b = _mm_load_ss(&pB[l]);
  175. c = _mm_add_ss(a, b);
  176. _mm_store_ss(&pC[l], c);
  177. }
  178. #else
  179. // plain code
  180. #pragma omp parallel for
  181. for (int idx = 0; idx < N; ++idx )
  182. {
  183. C[idx] = A[idx] + B[idx];
  184. }
  185. #endif
  186. }
  187. void VEX_vadds(const float* A, const float premultiplier, const float* B, float* C, const int N)
  188. {
  189. #ifdef LUMINANCE_USE_SSE
  190. const __m128 val = _mm_set1_ps(premultiplier);
  191. __m128 a, b, c;
  192. const int LOOP1 = (N >> 4);
  193. const int ELEMS_LOOP1 = (LOOP1 << 4);
  194. const int LOOP2 = (N - ELEMS_LOOP1);
  195. #pragma omp parallel for schedule(static, 5120) private(a,b,c)
  196. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  197. {
  198. PREFETCH_T0(&A[l], FETCH_DISTANCE);
  199. PREFETCH_T0(&B[l], FETCH_DISTANCE);
  200. PREFETCH_T0(&C[l], FETCH_DISTANCE);
  201. a = _mm_load_ps(&A[l]);
  202. b = _mm_load_ps(&B[l]);
  203. b = _mm_mul_ps(b, val);
  204. c = _mm_add_ps(a, b);
  205. _mm_store_ps(&C[l], c);
  206. a = _mm_load_ps(&A[l+4]);
  207. b = _mm_load_ps(&B[l+4]);
  208. b = _mm_mul_ps(b, val);
  209. c = _mm_add_ps(a, b);
  210. _mm_store_ps(&C[l+4], c);
  211. a = _mm_load_ps(&A[l+8]);
  212. b = _mm_load_ps(&B[l+8]);
  213. b = _mm_mul_ps(b, val);
  214. c = _mm_add_ps(a, b);
  215. _mm_store_ps(&C[l+8], c);
  216. a = _mm_load_ps(&A[l+12]);
  217. b = _mm_load_ps(&B[l+12]);
  218. b = _mm_mul_ps(b, val);
  219. c = _mm_add_ps(a, b);
  220. _mm_store_ps(&C[l+12], c);
  221. }
  222. const float* pA = &A[ELEMS_LOOP1];
  223. const float* pB = &B[ELEMS_LOOP1];
  224. float* pC = &C[ELEMS_LOOP1];
  225. for (int l = 0; l < LOOP2; l++)
  226. {
  227. a = _mm_load_ss(&pA[l]);
  228. b = _mm_load_ss(&pB[l]);
  229. b = _mm_mul_ss(b, val);
  230. c = _mm_add_ss(a, b);
  231. _mm_store_ss(&pC[l], c);
  232. }
  233. #else
  234. // plain code
  235. #pragma omp parallel for
  236. for (int idx = 0; idx < N; ++idx )
  237. {
  238. C[idx] = A[idx] + premultiplier * B[idx];
  239. }
  240. #endif
  241. }
  242. void VEX_vsmul(const float* I, const float premultiplier, float* O, const int N)
  243. {
  244. #ifdef LUMINANCE_USE_SSE
  245. const __m128 val = _mm_set1_ps(premultiplier);
  246. __m128 t;
  247. const int LOOP1 = (N >> 4);
  248. const int ELEMS_LOOP1 = (LOOP1 << 4);
  249. const int LOOP2 = (N - ELEMS_LOOP1);
  250. #pragma omp parallel for schedule(static, 5120) private(t)
  251. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  252. {
  253. PREFETCH_T0(&I[l], FETCH_DISTANCE);
  254. PREFETCH_T0(&O[l], FETCH_DISTANCE);
  255. t = _mm_load_ps(&I[l]);
  256. t = _mm_mul_ps(t, val);
  257. _mm_store_ps(&O[l], t);
  258. t = _mm_load_ps(&I[l+4]);
  259. t = _mm_mul_ps(t, val);
  260. _mm_store_ps(&O[l+4], t);
  261. t = _mm_load_ps(&I[l+8]);
  262. t = _mm_mul_ps(t, val);
  263. _mm_store_ps(&O[l+8], t);
  264. t = _mm_load_ps(&I[l+12]);
  265. t = _mm_mul_ps(t, val);
  266. _mm_store_ps(&O[l+12], t);
  267. }
  268. const float* pI = &I[ELEMS_LOOP1];
  269. float* pO = &O[ELEMS_LOOP1];
  270. for (int l = 0; l < LOOP2; l++)
  271. {
  272. t = _mm_load_ss(&pI[l]);
  273. t = _mm_mul_ss(t, val);
  274. _mm_store_ss(&pO[l], t);
  275. }
  276. #else
  277. // plain code
  278. #pragma omp parallel for
  279. for(int idx = 0; idx < N; ++idx)
  280. {
  281. O[idx] = premultiplier * I[idx];
  282. }
  283. #endif
  284. }
  285. void VEX_vmul(const float* A, const float* B, float* C, const int N)
  286. {
  287. #ifdef LUMINANCE_USE_SSE
  288. __m128 a, b;
  289. const int LOOP1 = (N >> 4);
  290. const int ELEMS_LOOP1 = (LOOP1 << 4);
  291. const int LOOP2 = (N - ELEMS_LOOP1);
  292. #pragma omp parallel for schedule(static, 5120) private(a, b)
  293. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  294. {
  295. PREFETCH_T0(&A[l], FETCH_DISTANCE);
  296. PREFETCH_T0(&B[l], FETCH_DISTANCE);
  297. PREFETCH_T0(&C[l], FETCH_DISTANCE);
  298. a = _mm_load_ps(&A[l]);
  299. b = _mm_load_ps(&B[l]);
  300. a = _mm_mul_ps(a, b);
  301. _mm_store_ps(&C[l], a);
  302. a = _mm_load_ps(&A[l+4]);
  303. b = _mm_load_ps(&B[l+4]);
  304. a = _mm_mul_ps(a, b);
  305. _mm_store_ps(&C[l+4], a);
  306. a = _mm_load_ps(&A[l+8]);
  307. b = _mm_load_ps(&B[l+8]);
  308. a = _mm_mul_ps(a, b);
  309. _mm_store_ps(&C[l+8], a);
  310. a = _mm_load_ps(&A[l+12]);
  311. b = _mm_load_ps(&B[l+12]);
  312. a = _mm_mul_ps(a, b);
  313. _mm_store_ps(&C[l+12], a);
  314. }
  315. const float* pA = &A[ELEMS_LOOP1];
  316. const float* pB = &B[ELEMS_LOOP1];
  317. float* pC = &C[ELEMS_LOOP1];
  318. for (int l = 0; l < LOOP2; l++)
  319. {
  320. a = _mm_load_ss(&pA[l]);
  321. b = _mm_load_ss(&pB[l]);
  322. a = _mm_mul_ss(a, b);
  323. _mm_store_ss(&pC[l], a);
  324. }
  325. #else
  326. // plain code
  327. #pragma omp parallel for
  328. for (int idx = 0; idx < N; ++idx )
  329. {
  330. C[idx] = A[idx] * B[idx];
  331. }
  332. #endif
  333. }
  334. void VEX_vdiv(const float* A, const float* B, float* C, const int N)
  335. {
  336. #ifdef LUMINANCE_USE_SSE
  337. __m128 a, b;
  338. const int LOOP1 = (N >> 4);
  339. const int ELEMS_LOOP1 = (LOOP1 << 4);
  340. const int LOOP2 = (N - ELEMS_LOOP1);
  341. #pragma omp parallel for schedule(static, 5120) private(a, b)
  342. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  343. {
  344. PREFETCH_T0(&A[l], FETCH_DISTANCE);
  345. PREFETCH_T0(&B[l], FETCH_DISTANCE);
  346. PREFETCH_T0(&C[l], FETCH_DISTANCE);
  347. a = _mm_load_ps(&A[l]);
  348. b = _mm_load_ps(&B[l]);
  349. a = _mm_div_ps(a, b);
  350. _mm_store_ps(&C[l], a);
  351. a = _mm_load_ps(&A[l+4]);
  352. b = _mm_load_ps(&B[l+4]);
  353. a = _mm_div_ps(a, b);
  354. _mm_store_ps(&C[l+4], a);
  355. a = _mm_load_ps(&A[l+8]);
  356. b = _mm_load_ps(&B[l+8]);
  357. a = _mm_div_ps(a, b);
  358. _mm_store_ps(&C[l+8], a);
  359. a = _mm_load_ps(&A[l+12]);
  360. b = _mm_load_ps(&B[l+12]);
  361. a = _mm_div_ps(a, b);
  362. _mm_store_ps(&C[l+12], a);
  363. }
  364. const float* pA = &A[ELEMS_LOOP1];
  365. const float* pB = &B[ELEMS_LOOP1];
  366. float* pC = &C[ELEMS_LOOP1];
  367. for (int l = 0; l < LOOP2; l++)
  368. {
  369. a = _mm_load_ss(&pA[l]);
  370. b = _mm_load_ss(&pB[l]);
  371. a = _mm_div_ss(a, b);
  372. _mm_store_ss(&pC[l], a);
  373. }
  374. #else
  375. // plain code
  376. #pragma omp parallel for
  377. for (int idx = 0; idx < N; ++idx )
  378. {
  379. C[idx] = A[idx] / B[idx];
  380. }
  381. #endif
  382. }
  383. void VEX_vcopy(const float* I, float* O, const int N)
  384. {
  385. #ifdef LUMINANCE_USE_SSE
  386. const int LOOP1 = (N >> 4);
  387. const int ELEMS_LOOP1 = (LOOP1 << 4);
  388. const int LOOP2 = (N - ELEMS_LOOP1);
  389. #pragma omp parallel for schedule(static, 5120)
  390. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  391. {
  392. PREFETCH_T0(&O[l], FETCH_DISTANCE);
  393. PREFETCH_T0(&I[l], FETCH_DISTANCE);
  394. _mm_store_ps(&O[l], _mm_load_ps(&I[l]));
  395. _mm_store_ps(&O[l+4], _mm_load_ps(&I[l+4]));
  396. _mm_store_ps(&O[l+8], _mm_load_ps(&I[l+8]));
  397. _mm_store_ps(&O[l+12], _mm_load_ps(&I[l+12]));
  398. }
  399. const float* pI = &I[ELEMS_LOOP1];
  400. float* pO = &O[ELEMS_LOOP1];
  401. for (int l = 0; l < LOOP2; l++)
  402. {
  403. _mm_store_ss(&pO[l], _mm_load_ss(&pI[l]));
  404. }
  405. _mm_sfence();
  406. #else
  407. // plain code
  408. #pragma omp parallel for
  409. for(int idx = 0; idx < N; ++idx)
  410. {
  411. O[idx] = I[idx];
  412. }
  413. #endif
  414. }
  415. void VEX_vset(float* IO, const float premultiplier, const int N)
  416. {
  417. #ifdef LUMINANCE_USE_SSE
  418. const int LOOP1 = (N >> 4);
  419. const int ELEMS_LOOP1 = (LOOP1 << 4);
  420. const int LOOP2 = (N - ELEMS_LOOP1);
  421. const __m128 val = _mm_set1_ps(premultiplier);
  422. #pragma omp parallel for schedule(static, 5120)
  423. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  424. {
  425. PREFETCH_T0(&IO[l], FETCH_DISTANCE);
  426. _mm_store_ps(&IO[l], val);
  427. _mm_store_ps(&IO[l+4], val);
  428. _mm_store_ps(&IO[l+8], val);
  429. _mm_store_ps(&IO[l+12], val);
  430. }
  431. float* pIO = &IO[ELEMS_LOOP1];
  432. for (int l = 0; l < LOOP2; l++)
  433. {
  434. _mm_store_ss(&pIO[l], val);
  435. }
  436. #else
  437. // plain code
  438. #pragma omp parallel for
  439. for(int idx = 0; idx < N; ++idx)
  440. {
  441. IO[idx] = premultiplier;
  442. }
  443. #endif
  444. }
  445. void VEX_vreset(float* IO, const int N)
  446. {
  447. #ifdef LUMINANCE_USE_SSE
  448. const int LOOP1 = (N >> 4);
  449. const int ELEMS_LOOP1 = (LOOP1 << 4);
  450. const int LOOP2 = (N - ELEMS_LOOP1);
  451. const __m128 zero = _mm_setzero_ps();
  452. #pragma omp parallel for schedule(static, 5120)
  453. for (int l = 0; l < ELEMS_LOOP1; l+=16)
  454. {
  455. PREFETCH_T0(&IO[l], FETCH_DISTANCE);
  456. _mm_store_ps(&IO[l], zero);
  457. _mm_store_ps(&IO[l+4], zero);
  458. _mm_store_ps(&IO[l+8], zero);
  459. _mm_store_ps(&IO[l+12], zero);
  460. }
  461. float* pIO = &IO[ELEMS_LOOP1];
  462. for (int l = 0; l < LOOP2; l++)
  463. {
  464. _mm_store_ss(&pIO[l], zero);
  465. }
  466. #else
  467. // plain code
  468. #pragma omp parallel for
  469. for (int idx = 0; idx < N; ++idx)
  470. {
  471. IO[idx] = 0.0f;
  472. }
  473. #endif
  474. }
  475. void VEX_dotpr(const float* I1, const float* I2, float& val, const int N)
  476. {
  477. float t_val = 0.0f;
  478. #pragma omp parallel for reduction(+:t_val)
  479. for (int idx = 0; idx < N; ++idx)
  480. {
  481. t_val += I1[idx] * I2[idx];
  482. }
  483. val = t_val;
  484. }
#ifdef LUMINANCE_USE_SSE
/* Implementation lifted from http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html */

/* Degree of the minimax polynomial used by _mm_exp2_ps below (2..5). */
#define EXP_POLY_DEGREE 5

/* POLYn(x, c0, ..., cn) evaluates c0 + c1*x + ... + cn*x^n per SSE lane
 * using a Horner scheme, expanding recursively into packed mul/add
 * intrinsics. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/**
 * @brief Fast approximation of 2^x in each lane of a packed float vector.
 *
 * Splits x into integer and fractional parts; the integer part is turned
 * into a power of two by writing the biased exponent directly into the
 * IEEE-754 exponent bits, and the fractional part is approximated with a
 * minimax polynomial.
 *
 * NOTE(review): v4sf is presumably a typedef for an SSE float vector
 * declared in vex.h -- TODO confirm. The _mm_* prefix is Intel's reserved
 * intrinsic namespace; a project-specific prefix would be safer.
 */
v4sf _mm_exp2_ps(v4sf x)
{
    __m128i ipart;
    v4sf fpart, expipart, expfpart;

    /* Clamp x so that the biased exponent computed below stays valid. */
    x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
    x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

    /* ipart = int(x - 0.5) */
    ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

    /* fpart = x - ipart */
    fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

    /* expipart = (float) (1 << ipart) -- biased exponent (ipart + 127)
     * shifted into the IEEE-754 exponent field. */
    expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

    /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
    expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
    expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
    expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
    expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

    /* 2^x = 2^ipart * 2^fpart */
    return _mm_mul_ps(expipart, expfpart);
}

/* Degree of the minimax polynomial used by _mm_log2_ps below (3..6). */
#define LOG_POLY_DEGREE 5

/**
 * @brief Fast approximation of log2(x) in each lane of a packed vector.
 *
 * Extracts exponent and mantissa bitwise, so x is assumed to be a positive,
 * finite, normalized float in every lane (denormals/negatives/NaN give
 * meaningless results).
 */
v4sf _mm_log2_ps(v4sf x)
{
    __m128i exp = _mm_set1_epi32(0x7F800000);   /* IEEE-754 exponent mask */
    __m128i mant = _mm_set1_epi32(0x007FFFFF);  /* IEEE-754 mantissa mask */
    v4sf one = _mm_set1_ps(1.0f);
    __m128i i = _mm_castps_si128(x);

    /* e = unbiased exponent of x, as a float. */
    v4sf e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_set1_epi32(127)));

    /* m = mantissa of x normalized into [1, 2). */
    v4sf m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
    v4sf p;

    /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
#if LOG_POLY_DEGREE == 6
    p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
    p = _mm_mul_ps(p, _mm_sub_ps(m, one));

    /* log2(x) = e + log2(m) */
    return _mm_add_ps(p, e);
}

/**
 * @brief Approximates x^y per lane as 2^(y * log2(x)); valid for x > 0.
 *
 * NOTE(review): the bare '*' between v4sf operands relies on v4sf being a
 * compiler vector type with element-wise operators (e.g. a GCC
 * vector_size typedef) -- TODO confirm against vex.h.
 */
v4sf _mm_pow_ps(v4sf x, v4sf y)
{
    return _mm_exp2_ps(_mm_log2_ps(x) * y);
}
#endif // LUMINANCE_USE_SSE