/stdlib.ispc

http://github.com/ispc/ispc

  1. // -*- mode: c++ -*-
  2. /*
  3. Copyright (c) 2010-2014, Intel Corporation
  4. All rights reserved.
  5. Redistribution and use in source and binary forms, with or without
  6. modification, are permitted provided that the following conditions are
  7. met:
  8. * Redistributions of source code must retain the above copyright
  9. notice, this list of conditions and the following disclaimer.
  10. * Redistributions in binary form must reproduce the above copyright
  11. notice, this list of conditions and the following disclaimer in the
  12. documentation and/or other materials provided with the distribution.
  13. * Neither the name of Intel Corporation nor the names of its
  14. contributors may be used to endorse or promote products derived from
  15. this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
  17. IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  18. TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  19. PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
  20. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  21. EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  22. PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  23. PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24. LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25. NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. */
  28. /** @file stdlib.ispc
  29. @brief Portion of the ispc standard library implementation that's in
  30. ispc code
  31. */
  32. #if (ISPC_MASK_BITS == 1)
  33. #define IntMaskType bool
  34. #define UIntMaskType bool
  35. #elif (ISPC_MASK_BITS == 8)
  36. #define IntMaskType int8
  37. #define UIntMaskType unsigned int8
  38. #elif (ISPC_MASK_BITS == 16)
  39. #define IntMaskType int16
  40. #define UIntMaskType unsigned int16
  41. #elif (ISPC_MASK_BITS == 32)
  42. #define IntMaskType int32
  43. #define UIntMaskType unsigned int32
  44. #elif (ISPC_MASK_BITS == 64)
  45. #define IntMaskType int64
  46. #define UIntMaskType unsigned int64
  47. #else
  48. #error Unknown value of ISPC_MASK_BITS
  49. #endif
  50. ///////////////////////////////////////////////////////////////////////////
  51. // CUDA Specific primitives
  52. //
  53. /***************/
  54. __declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
  55. __declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
  56. __declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); }
  57. /***************/
  58. __declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
  59. __declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
  60. __declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
  61. __declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
  62. /***************/
  63. __declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
  64. __declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
  65. __declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
  66. __declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
  67. /* Limits of integral types. */
  68. #ifndef INT8_MAX
  69. #define INT8_MAX (127)
  70. #endif
  71. #ifndef INT16_MAX
  72. #define INT16_MAX (32767)
  73. #endif
  74. #ifndef INT32_MAX
  75. #define INT32_MAX (2147483647)
  76. #endif
  77. #ifndef INT64_MAX
  78. #define INT64_MAX (9223372036854775807)
  79. #endif
  80. #ifndef UINT8_MAX
  81. #define UINT8_MAX (255)
  82. #endif
  83. #ifndef UINT16_MAX
  84. #define UINT16_MAX (65535)
  85. #endif
  86. #ifndef UINT32_MAX
  87. #define UINT32_MAX (4294967295)
  88. #endif
  89. #ifndef UINT64_MAX
  90. #define UINT64_MAX (18446744073709551615)
  91. #endif
  92. #ifndef INT8_MIN
  93. #define INT8_MIN (-INT8_MAX - 1)
  94. #endif
  95. #ifndef INT16_MIN
  96. #define INT16_MIN (-INT16_MAX - 1)
  97. #endif
  98. #ifndef INT32_MIN
  99. #define INT32_MIN (-INT32_MAX - 1)
  100. #endif
  101. #ifndef INT64_MIN
  102. #define INT64_MIN (-INT64_MAX - 1)
  103. #endif
  104. ///////////////////////////////////////////////////////////////////////////
  105. // Low level primitives
  106. __declspec(safe,cost0)
  107. static inline float floatbits(unsigned int a) {
  108. return __floatbits_varying_int32(a);
  109. }
  110. __declspec(safe,cost0)
  111. static inline uniform float floatbits(uniform unsigned int a) {
  112. return __floatbits_uniform_int32(a);
  113. }
  114. __declspec(safe,cost0)
  115. static inline float floatbits(int a) {
  116. return __floatbits_varying_int32(a);
  117. }
  118. __declspec(safe,cost0)
  119. static inline uniform float floatbits(uniform int a) {
  120. return __floatbits_uniform_int32(a);
  121. }
  122. __declspec(safe,cost0)
  123. static inline double doublebits(unsigned int64 a) {
  124. return __doublebits_varying_int64(a);
  125. }
  126. __declspec(safe,cost0)
  127. static inline uniform double doublebits(uniform unsigned int64 a) {
  128. return __doublebits_uniform_int64(a);
  129. }
  130. __declspec(safe,cost0)
  131. static inline unsigned int intbits(float a) {
  132. return __intbits_varying_float(a);
  133. }
  134. __declspec(safe,cost0)
  135. static inline uniform unsigned int intbits(uniform float a) {
  136. return __intbits_uniform_float(a);
  137. }
  138. __declspec(safe,cost0)
  139. static inline unsigned int64 intbits(double d) {
  140. return __intbits_varying_double(d);
  141. }
  142. __declspec(safe,cost0)
  143. static inline uniform unsigned int64 intbits(uniform double d) {
  144. return __intbits_uniform_double(d);
  145. }
  146. __declspec(safe)
  147. static inline float broadcast(float v, uniform int i) {
  148. return __broadcast_float(v, i);
  149. }
  150. __declspec(safe)
  151. static inline int8 broadcast(int8 v, uniform int i) {
  152. return __broadcast_i8(v, i);
  153. }
  154. __declspec(safe)
  155. static inline int16 broadcast(int16 v, uniform int i) {
  156. return __broadcast_i16(v, i);
  157. }
  158. __declspec(safe)
  159. static inline int32 broadcast(int32 v, uniform int i) {
  160. return __broadcast_i32(v, i);
  161. }
  162. __declspec(safe)
  163. static inline double broadcast(double v, uniform int i) {
  164. return __broadcast_double(v, i);
  165. }
  166. __declspec(safe)
  167. static inline int64 broadcast(int64 v, uniform int i) {
  168. return __broadcast_i64(v, i);
  169. }
  170. __declspec(safe)
  171. static inline float rotate(float v, uniform int i) {
  172. return __rotate_float(v, i);
  173. }
  174. __declspec(safe)
  175. static inline int8 rotate(int8 v, uniform int i) {
  176. return __rotate_i8(v, i);
  177. }
  178. __declspec(safe)
  179. static inline int16 rotate(int16 v, uniform int i) {
  180. return __rotate_i16(v, i);
  181. }
  182. __declspec(safe)
  183. static inline int32 rotate(int32 v, uniform int i) {
  184. return __rotate_i32(v, i);
  185. }
  186. __declspec(safe)
  187. static inline double rotate(double v, uniform int i) {
  188. return __rotate_double(v, i);
  189. }
  190. __declspec(safe)
  191. static inline int64 rotate(int64 v, uniform int i) {
  192. return __rotate_i64(v, i);
  193. }
  194. __declspec(safe)
  195. static inline float shift(float v, uniform int i) {
  196. varying float result;
  197. unmasked {
  198. result = __shift_float(v, i);
  199. }
  200. return result;
  201. }
  202. __declspec(safe)
  203. static inline int8 shift(int8 v, uniform int i) {
  204. varying int8 result;
  205. unmasked {
  206. result = __shift_i8(v, i);
  207. }
  208. return result;
  209. }
  210. __declspec(safe)
  211. static inline int16 shift(int16 v, uniform int i) {
  212. varying int16 result;
  213. unmasked {
  214. result = __shift_i16(v, i);
  215. }
  216. return result;
  217. }
  218. __declspec(safe)
  219. static inline int32 shift(int32 v, uniform int i) {
  220. varying int32 result;
  221. unmasked {
  222. result = __shift_i32(v, i);
  223. }
  224. return result;
  225. }
  226. __declspec(safe)
  227. static inline double shift(double v, uniform int i) {
  228. varying double result;
  229. unmasked {
  230. result = __shift_double(v, i);
  231. }
  232. return result;
  233. }
  234. __declspec(safe)
  235. static inline int64 shift(int64 v, uniform int i) {
  236. varying int64 result;
  237. unmasked {
  238. result = __shift_i64(v, i);
  239. }
  240. return result;
  241. }
  242. __declspec(safe)
  243. static inline float shuffle(float v, int i) {
  244. return __shuffle_float(v, i);
  245. }
  246. __declspec(safe)
  247. static inline int8 shuffle(int8 v, int i) {
  248. return __shuffle_i8(v, i);
  249. }
  250. __declspec(safe)
  251. static inline int16 shuffle(int16 v, int i) {
  252. return __shuffle_i16(v, i);
  253. }
  254. __declspec(safe)
  255. static inline int32 shuffle(int32 v, int i) {
  256. return __shuffle_i32(v, i);
  257. }
  258. __declspec(safe)
  259. static inline double shuffle(double v, int i) {
  260. return __shuffle_double(v, i);
  261. }
  262. __declspec(safe)
  263. static inline int64 shuffle(int64 v, int i) {
  264. return __shuffle_i64(v, i);
  265. }
  266. __declspec(safe)
  267. static inline float shuffle(float v0, float v1, int i) {
  268. return __shuffle2_float(v0, v1, i);
  269. }
  270. __declspec(safe)
  271. static inline int8 shuffle(int8 v0, int8 v1, int i) {
  272. return __shuffle2_i8(v0, v1, i);
  273. }
  274. __declspec(safe)
  275. static inline int16 shuffle(int16 v0, int16 v1, int i) {
  276. return __shuffle2_i16(v0, v1, i);
  277. }
  278. __declspec(safe)
  279. static inline int32 shuffle(int32 v0, int32 v1, int i) {
  280. return __shuffle2_i32(v0, v1, i);
  281. }
  282. __declspec(safe)
  283. static inline double shuffle(double v0, double v1, int i) {
  284. return __shuffle2_double(v0, v1, i);
  285. }
  286. __declspec(safe)
  287. static inline int64 shuffle(int64 v0, int64 v1, int i) {
  288. return __shuffle2_i64(v0, v1, i);
  289. }
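// Illustrative usage sketch, not part of the stdlib: shuffle() gathers each
// lane's result from an arbitrary source lane, so reversing the lane order of
// a varying value is a one-liner.  reverse_lanes() is a hypothetical helper
// added here only for illustration.
static inline float reverse_lanes(float v) {
    return shuffle(v, programCount - 1 - programIndex);
}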
  290. // x[i]
  291. __declspec(safe,cost1)
  292. static inline uniform float extract(float x, uniform int i) {
  293. return floatbits(__extract_int32((int)intbits(x), i));
  294. }
  295. __declspec(safe,cost1)
  296. static inline uniform int8 extract(int8 x, uniform int i) {
  297. return __extract_int8(x, i);
  298. }
  299. __declspec(safe,cost1)
  300. static inline uniform unsigned int8 extract(unsigned int8 x, uniform int i) {
  301. return __extract_int8(x, (unsigned int)i);
  302. }
  303. __declspec(safe,cost1)
  304. static inline uniform int16 extract(int16 x, uniform int i) {
  305. return __extract_int16(x, i);
  306. }
  307. __declspec(safe,cost1)
  308. static inline uniform unsigned int16 extract(unsigned int16 x, uniform int i) {
  309. return __extract_int16(x, (unsigned int)i);
  310. }
  311. __declspec(safe,cost1)
  312. static inline uniform int32 extract(int32 x, uniform int i) {
  313. return __extract_int32(x, i);
  314. }
  315. __declspec(safe,cost1)
  316. static inline uniform unsigned int32 extract(unsigned int32 x, uniform int i) {
  317. return __extract_int32(x, (unsigned int)i);
  318. }
  319. __declspec(safe,cost1)
  320. static inline uniform double extract(double x, uniform int i) {
  321. return doublebits(__extract_int64((int64)intbits(x), i));
  322. }
  323. __declspec(safe,cost1)
  324. static inline uniform int64 extract(int64 x, uniform int i) {
  325. return __extract_int64(x, i);
  326. }
  327. __declspec(safe,cost1)
  328. static inline uniform unsigned int64 extract(unsigned int64 x, uniform int i) {
  329. return __extract_int64(x, (unsigned int)i);
  330. }
  331. // x[i] = v
  332. __declspec(safe,cost1)
  333. static inline float insert(float x, uniform int i, uniform float v) {
  334. return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
  335. }
  336. __declspec(safe,cost1)
  337. static inline int8 insert(int8 x, uniform int i, uniform int8 v) {
  338. return __insert_int8(x, i, v);
  339. }
  340. __declspec(safe,cost1)
  341. static inline unsigned int8 insert(unsigned int8 x, uniform int i,
  342. uniform unsigned int8 v) {
  343. return __insert_int8(x, (unsigned int)i, v);
  344. }
  345. __declspec(safe,cost1)
  346. static inline int16 insert(int16 x, uniform int i, uniform int16 v) {
  347. return __insert_int16(x, i, v);
  348. }
  349. __declspec(safe,cost1)
  350. static inline unsigned int16 insert(unsigned int16 x, uniform int i,
  351. uniform unsigned int16 v) {
  352. return __insert_int16(x, (unsigned int)i, v);
  353. }
  354. __declspec(safe,cost1)
  355. static inline int32 insert(int32 x, uniform int i, uniform int32 v) {
  356. return __insert_int32(x, i, v);
  357. }
  358. __declspec(safe,cost1)
  359. static inline unsigned int32 insert(unsigned int32 x, uniform int i,
  360. uniform unsigned int32 v) {
  361. return __insert_int32(x, (unsigned int)i, v);
  362. }
  363. __declspec(safe,cost1)
  364. static inline double insert(double x, uniform int i, uniform double v) {
  365. return doublebits(__insert_int64((int64)intbits(x), i, (int64)intbits(v)));
  366. }
  367. __declspec(safe,cost1)
  368. static inline int64 insert(int64 x, uniform int i, uniform int64 v) {
  369. return __insert_int64(x, i, v);
  370. }
  371. __declspec(safe,cost1)
  372. static inline unsigned int64 insert(unsigned int64 x, uniform int i,
  373. uniform unsigned int64 v) {
  374. return __insert_int64(x, (unsigned int)i, v);
  375. }
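// Illustrative sketch, not part of the stdlib: extract() reads one lane of a
// varying value into a uniform value and insert() overwrites one lane, so a
// serial per-lane loop over a varying value follows the pattern below.  The
// hypothetical helper just computes v + 1, one lane at a time.
static inline int add_one_serially(int v) {
    int r = v;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, extract(v, i) + 1);
    return r;
}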
  376. __declspec(safe,cost1)
  377. static inline uniform int32 sign_extend(uniform bool v) {
  378. return __sext_uniform_bool(v);
  379. }
  380. __declspec(safe,cost1)
  381. static inline int32 sign_extend(bool v) {
  382. return __sext_varying_bool(v);
  383. }
  384. __declspec(safe)
  385. static inline uniform bool any(bool v) {
  386. // We only care about whether "any" is true for the active program instances,
  387. // so we have to mask v with the current program mask.
  388. #if (ISPC_MASK_BITS == 1)
  389. return __any(v & __mask);
  390. #else
  391. return __any((UIntMaskType)__sext_varying_bool(v) & __mask);
  392. #endif
  393. }
  394. __declspec(safe)
  395. static inline uniform bool all(bool v) {
  396. // As with any(), we need to explicitly mask v with the current program mask
  397. // so we're only looking at the current lanes
  398. #if (ISPC_MASK_BITS == 1)
  399. return __all(v | !__mask);
  400. #else
  401. return __all((UIntMaskType)__sext_varying_bool(v) | !__mask);
  402. #endif
  403. }
  404. __declspec(safe)
  405. static inline uniform bool none(bool v) {
  406. // As with any(), we need to explicitly mask v with the current program mask
  407. // so we're only looking at the current lanes
  408. #if (ISPC_MASK_BITS == 1)
  409. return __none(v & __mask);
  410. #else
  411. return __none((UIntMaskType)__sext_varying_bool(v) & __mask);
  412. #endif
  413. }
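// Illustrative usage sketch, not part of the stdlib: any(), all() and none()
// reduce a varying bool over the active lanes only, which makes them handy
// for uniform early-out tests.  in_range() is a hypothetical helper added
// here for illustration.
static inline uniform bool in_range(float v, uniform float low, uniform float high) {
    return all(v >= low && v <= high);
}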
  414. __declspec(safe)
  415. static inline uniform int32 popcnt(uniform int32 v) {
  416. return __popcnt_int32(v);
  417. }
  418. __declspec(safe)
  419. static inline uniform int popcnt(uniform int64 v) {
  420. return (int32)__popcnt_int64(v);
  421. }
  422. __declspec(safe)
  423. static inline int popcnt(int v) {
  424. int r;
  425. for (uniform int i = 0; i < programCount; ++i)
  426. r = insert(r, i, popcnt(extract(v, i)));
  427. return __mask ? r : 0;
  428. }
  429. __declspec(safe)
  430. static inline int popcnt(int64 v) {
  431. int r;
  432. for (uniform int i = 0; i < programCount; ++i)
  433. r = insert(r, i, popcnt(extract(v, i)));
  434. return __mask ? r : 0;
  435. }
  436. __declspec(safe)
  437. static inline uniform int popcnt(bool v) {
  438. // As with any() and all(), only count across the active lanes
  439. #if (ISPC_MASK_BITS == 1)
  440. if (__is_nvptx_target)
  441. return __popcnt_int64(__movmsk_ptx(v & __mask));
  442. else
  443. return __popcnt_int64(__movmsk(v & __mask));
  444. #else
  445. return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
  446. #endif
  447. }
  448. __declspec(safe)
  449. static inline uniform unsigned int64 lanemask() {
  450. return __movmsk(__mask);
  451. }
  452. ///////////////////////////////////////////////////////////////////////////
  453. // memcpy/memmove/memset
  454. static inline void memcpy(void * uniform dst, void * uniform src,
  455. uniform int32 count) {
  456. __memcpy32((int8 * uniform)dst, (int8 * uniform)src, count);
  457. }
  458. static inline void memcpy64(void * uniform dst, void * uniform src,
  459. uniform int64 count) {
  460. __memcpy64((int8 * uniform)dst, (int8 * uniform)src, count);
  461. }
  462. static inline void memcpy(void * varying dst, void * varying src,
  463. int32 count) {
  464. void * uniform da[programCount];
  465. void * uniform sa[programCount];
  466. da[programIndex] = dst;
  467. sa[programIndex] = src;
  468. foreach_active (i) {
  469. void * uniform d = da[i], * uniform s = sa[i];
  470. __memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
  471. }
  472. }
  473. static inline void memcpy64(void * varying dst, void * varying src,
  474. int64 count) {
  475. void * uniform da[programCount];
  476. void * uniform sa[programCount];
  477. da[programIndex] = dst;
  478. sa[programIndex] = src;
  479. foreach_active (i) {
  480. void * uniform d = da[i], * uniform s = sa[i];
  481. __memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
  482. }
  483. }
  484. static inline void memmove(void * uniform dst, void * uniform src,
  485. uniform int32 count) {
  486. __memmove32((int8 * uniform)dst, (int8 * uniform)src, count);
  487. }
  488. static inline void memmove64(void * uniform dst, void * uniform src,
  489. uniform int64 count) {
  490. __memmove64((int8 * uniform)dst, (int8 * uniform)src, count);
  491. }
  492. static inline void memmove(void * varying dst, void * varying src,
  493. int32 count) {
  494. void * uniform da[programCount];
  495. void * uniform sa[programCount];
  496. da[programIndex] = dst;
  497. sa[programIndex] = src;
  498. foreach_active (i) {
  499. void * uniform d = da[i], * uniform s = sa[i];
  500. __memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
  501. }
  502. }
  503. static inline void memmove64(void * varying dst, void * varying src,
  504. int64 count) {
  505. void * uniform da[programCount];
  506. void * uniform sa[programCount];
  507. da[programIndex] = dst;
  508. sa[programIndex] = src;
  509. foreach_active (i) {
  510. void * uniform d = da[i], * uniform s = sa[i];
  511. __memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
  512. }
  513. }
  514. static inline void memset(void * uniform ptr, uniform int8 val,
  515. uniform int32 count) {
  516. __memset32((int8 * uniform)ptr, val, count);
  517. }
  518. static inline void memset64(void * uniform ptr, uniform int8 val,
  519. uniform int64 count) {
  520. __memset64((int8 * uniform)ptr, val, count);
  521. }
  522. static inline void memset(void * varying ptr, int8 val, int32 count) {
  523. void * uniform pa[programCount];
  524. pa[programIndex] = ptr;
  525. foreach_active (i) {
  526. __memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i));
  527. }
  528. }
  529. static inline void memset64(void * varying ptr, int8 val, int64 count) {
  530. void * uniform pa[programCount];
  531. pa[programIndex] = ptr;
  532. foreach_active (i) {
  533. __memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i));
  534. }
  535. }
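// Illustrative usage sketch, not part of the stdlib: the uniform overloads
// above operate on a single buffer shared by the whole gang.  copy_floats()
// is a hypothetical helper for illustration; it assumes the buffers do not
// overlap (use memmove()/memmove64() if they might).
static inline void copy_floats(uniform float dst[], uniform float src[],
                               uniform int64 count) {
    memcpy64(dst, src, count * sizeof(uniform float));
}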
  536. ///////////////////////////////////////////////////////////////////////////
  537. // count leading/trailing zeros
  538. __declspec(safe,cost1)
  539. static inline uniform unsigned int32
  540. count_leading_zeros(uniform unsigned int32 v) {
  541. return __count_leading_zeros_i32(v);
  542. }
  543. __declspec(safe,cost1)
  544. static inline uniform unsigned int64
  545. count_leading_zeros(uniform unsigned int64 v) {
  546. return __count_leading_zeros_i64(v);
  547. }
  548. __declspec(safe,cost1)
  549. static inline uniform unsigned int32
  550. count_trailing_zeros(uniform unsigned int32 v) {
  551. return __count_trailing_zeros_i32(v);
  552. }
  553. __declspec(safe,cost1)
  554. static inline uniform unsigned int64
  555. count_trailing_zeros(uniform unsigned int64 v) {
  556. return __count_trailing_zeros_i64(v);
  557. }
  558. __declspec(safe,cost1)
  559. static inline uniform int32
  560. count_leading_zeros(uniform int32 v) {
  561. return __count_leading_zeros_i32(v);
  562. }
  563. __declspec(safe,cost1)
  564. static inline uniform int64
  565. count_leading_zeros(uniform int64 v) {
  566. return __count_leading_zeros_i64(v);
  567. }
  568. __declspec(safe,cost1)
  569. static inline uniform int32
  570. count_trailing_zeros(uniform int32 v) {
  571. return __count_trailing_zeros_i32(v);
  572. }
  573. __declspec(safe,cost1)
  574. static inline uniform int64
  575. count_trailing_zeros(uniform int64 v) {
  576. return __count_trailing_zeros_i64(v);
  577. }
  578. __declspec(safe)
  579. static inline unsigned int32
  580. count_leading_zeros(unsigned int32 v) {
  581. unsigned int32 r;
  582. for (uniform int i = 0; i < programCount; ++i)
  583. r = insert(r, i, __count_leading_zeros_i32(extract(v, i)));
  584. return r;
  585. }
  586. __declspec(safe)
  587. static inline unsigned int64
  588. count_leading_zeros(unsigned int64 v) {
  589. unsigned int64 r;
  590. for (uniform int i = 0; i < programCount; ++i)
  591. r = insert(r, i, __count_leading_zeros_i64(extract(v, i)));
  592. return r;
  593. }
  594. __declspec(safe)
  595. static inline unsigned int32
  596. count_trailing_zeros(unsigned int32 v) {
  597. unsigned int32 r;
  598. for (uniform int i = 0; i < programCount; ++i)
  599. r = insert(r, i, __count_trailing_zeros_i32(extract(v, i)));
  600. return r;
  601. }
  602. __declspec(safe)
  603. static inline unsigned int64
  604. count_trailing_zeros(unsigned int64 v) {
  605. unsigned int64 r;
  606. for (uniform int i = 0; i < programCount; ++i)
  607. r = insert(r, i, __count_trailing_zeros_i64(extract(v, i)));
  608. return r;
  609. }
  610. __declspec(safe)
  611. static inline int32
  612. count_leading_zeros(int32 v) {
  613. int32 r;
  614. for (uniform int i = 0; i < programCount; ++i)
  615. r = insert(r, i, __count_leading_zeros_i32(extract(v, i)));
  616. return r;
  617. }
  618. __declspec(safe)
  619. static inline int64
  620. count_leading_zeros(int64 v) {
  621. int64 r;
  622. for (uniform int i = 0; i < programCount; ++i)
  623. r = insert(r, i, __count_leading_zeros_i64(extract(v, i)));
  624. return r;
  625. }
  626. __declspec(safe)
  627. static inline int32
  628. count_trailing_zeros(int32 v) {
  629. int32 r;
  630. for (uniform int i = 0; i < programCount; ++i)
  631. r = insert(r, i, __count_trailing_zeros_i32(extract(v, i)));
  632. return r;
  633. }
  634. __declspec(safe)
  635. static inline int64
  636. count_trailing_zeros(int64 v) {
  637. int64 r;
  638. for (uniform int i = 0; i < programCount; ++i)
  639. r = insert(r, i, __count_trailing_zeros_i64(extract(v, i)));
  640. return r;
  641. }
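// Illustrative sketch, not part of the stdlib: for a non-zero value, the
// index of the highest set bit (i.e. floor(log2(v))) follows directly from
// count_leading_zeros().  log2_int() is a hypothetical helper for
// illustration only; its result is undefined for v == 0.
static inline uniform int32 log2_int(uniform unsigned int32 v) {
    return 31 - count_leading_zeros(v);
}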
  642. ///////////////////////////////////////////////////////////////////////////
  643. // AOS/SOA conversion
  644. static inline void
  645. aos_to_soa3(uniform float a[], varying float * uniform v0,
  646. varying float * uniform v1, varying float * uniform v2) {
  647. __aos_to_soa3_float(a, v0, v1, v2);
  648. }
  649. static inline void
  650. soa_to_aos3(float v0, float v1, float v2, uniform float a[]) {
  651. __soa_to_aos3_float(v0, v1, v2, a);
  652. }
  653. static inline void
  654. aos_to_soa4(uniform float a[], varying float * uniform v0,
  655. varying float * uniform v1, varying float * uniform v2,
  656. varying float * uniform v3) {
  657. __aos_to_soa4_float(a, v0, v1, v2, v3);
  658. }
  659. static inline void
  660. soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[]) {
  661. __soa_to_aos4_float(v0, v1, v2, v3, a);
  662. }
  663. static inline void
  664. aos_to_soa3(uniform int32 a[], varying int32 * uniform v0,
  665. varying int32 * uniform v1, varying int32 * uniform v2) {
  666. aos_to_soa3((uniform float * uniform)a, (varying float * uniform)v0,
  667. (varying float * uniform)v1, (varying float * uniform)v2);
  668. }
  669. static inline void
  670. soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[]) {
  671. soa_to_aos3(floatbits(v0), floatbits(v1), floatbits(v2),
  672. (uniform float * uniform)a);
  673. }
  674. static inline void
  675. aos_to_soa4(uniform int32 a[], varying int32 * uniform v0,
  676. varying int32 * uniform v1, varying int32 * uniform v2,
  677. varying int32 * uniform v3) {
  678. aos_to_soa4((uniform float * uniform)a, (varying float * uniform )v0,
  679. (varying float * uniform)v1, (varying float * uniform)v2,
  680. (varying float * uniform)v3);
  681. }
  682. static inline void
  683. soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[]) {
  684. soa_to_aos4(floatbits(v0), floatbits(v1), floatbits(v2), floatbits(v3),
  685. (uniform float * uniform)a);
  686. }
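// Illustrative usage sketch, not part of the stdlib: aos_to_soa3() converts
// an interleaved x,y,z,x,y,z,... array into three varying values, one array
// element per program instance.  length_squared_aos() is a hypothetical
// helper for illustration; it assumes pts holds at least 3*programCount
// floats.
static inline float length_squared_aos(uniform float pts[]) {
    float x, y, z;
    aos_to_soa3(pts, &x, &y, &z);
    return x * x + y * y + z * z;   // squared length of each point
}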
  687. ///////////////////////////////////////////////////////////////////////////
  688. // Prefetching
  689. __declspec(safe,cost1)
  690. static inline void prefetch_l1(const void * uniform ptr) {
  691. __prefetch_read_uniform_1((uniform int8 * uniform)ptr);
  692. }
  693. __declspec(safe,cost1)
  694. static inline void prefetch_l2(const void * uniform ptr) {
  695. __prefetch_read_uniform_2((uniform int8 * uniform)ptr);
  696. }
  697. __declspec(safe,cost1)
  698. static inline void prefetch_l3(const void * uniform ptr) {
  699. __prefetch_read_uniform_3((uniform int8 * uniform)ptr);
  700. }
  701. __declspec(safe,cost1)
  702. static inline void prefetch_nt(const void * uniform ptr) {
  703. __prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
  704. }
  705. static inline void prefetch_l1(const void * varying ptr) {
  706. __pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
  707. }
  708. static inline void prefetch_l2(const void * varying ptr) {
  709. __pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
  710. }
  711. static inline void prefetch_l3(const void * varying ptr) {
  712. __pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
  713. }
  714. static inline void prefetch_nt(const void * varying ptr) {
  715. __pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
  716. }
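// Illustrative usage sketch, not part of the stdlib: prefetches are hints
// only, and a common pattern is to prefetch a fixed distance ahead of the
// current index while streaming through an array.  The helper and the
// look-ahead distance of 16 elements are hypothetical, for illustration only.
static inline float load_with_prefetch(uniform float a[], int i) {
    prefetch_l1(&a[i + 16]);
    return a[i];
}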
  717. ///////////////////////////////////////////////////////////////////////////
  718. // non-short-circuiting alternatives
  719. __declspec(safe,cost1)
  720. static inline bool and(bool a, bool b) {
  721. return a && b;
  722. }
  723. __declspec(safe,cost1)
  724. static inline uniform bool and(uniform bool a, uniform bool b) {
  725. return a && b;
  726. }
  727. __declspec(safe,cost1)
  728. static inline bool or(bool a, bool b) {
  729. return a || b;
  730. }
  731. __declspec(safe,cost1)
  732. static inline uniform bool or(uniform bool a, uniform bool b) {
  733. return a || b;
  734. }
  735. __declspec(safe,cost1)
  736. static inline int8 select(bool c, int8 a, int8 b) {
  737. return c ? a : b;
  738. }
  739. __declspec(safe,cost1)
  740. static inline int8 select(uniform bool c, int8 a, int8 b) {
  741. return c ? a : b;
  742. }
  743. __declspec(safe,cost1)
  744. static inline uniform int8 select(uniform bool c, uniform int8 a,
  745. uniform int8 b) {
  746. return c ? a : b;
  747. }
  748. __declspec(safe,cost1)
  749. static inline int16 select(bool c, int16 a, int16 b) {
  750. return c ? a : b;
  751. }
  752. __declspec(safe,cost1)
  753. static inline int16 select(uniform bool c, int16 a, int16 b) {
  754. return c ? a : b;
  755. }
  756. __declspec(safe,cost1)
  757. static inline uniform int16 select(uniform bool c, uniform int16 a,
  758. uniform int16 b) {
  759. return c ? a : b;
  760. }
  761. __declspec(safe,cost1)
  762. static inline int32 select(bool c, int32 a, int32 b) {
  763. return c ? a : b;
  764. }
  765. __declspec(safe,cost1)
  766. static inline int32 select(uniform bool c, int32 a, int32 b) {
  767. return c ? a : b;
  768. }
  769. __declspec(safe,cost1)
  770. static inline uniform int32 select(uniform bool c, uniform int32 a,
  771. uniform int32 b) {
  772. return c ? a : b;
  773. }
  774. __declspec(safe,cost1)
  775. static inline int64 select(bool c, int64 a, int64 b) {
  776. return c ? a : b;
  777. }
  778. __declspec(safe,cost1)
  779. static inline int64 select(uniform bool c, int64 a, int64 b) {
  780. return c ? a : b;
  781. }
  782. __declspec(safe,cost1)
  783. static inline uniform int64 select(uniform bool c, uniform int64 a,
  784. uniform int64 b) {
  785. return c ? a : b;
  786. }
  787. __declspec(safe,cost1)
  788. static inline float select(bool c, float a, float b) {
  789. return c ? a : b;
  790. }
  791. __declspec(safe,cost1)
  792. static inline float select(uniform bool c, float a, float b) {
  793. return c ? a : b;
  794. }
  795. __declspec(safe,cost1)
  796. static inline uniform float select(uniform bool c, uniform float a,
  797. uniform float b) {
  798. return c ? a : b;
  799. }
  800. __declspec(safe,cost1)
  801. static inline double select(bool c, double a, double b) {
  802. return c ? a : b;
  803. }
  804. __declspec(safe,cost1)
  805. static inline double select(uniform bool c, double a, double b) {
  806. return c ? a : b;
  807. }
  808. __declspec(safe,cost1)
  809. static inline uniform double select(uniform bool c, uniform double a,
  810. uniform double b) {
  811. return c ? a : b;
  812. }
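// Illustrative usage sketch, not part of the stdlib: because select() is an
// ordinary call, both alternatives are evaluated and the condition only picks
// between the two results, avoiding masked control flow for cheap operands.
// abs_diff() is a hypothetical helper for illustration.
static inline float abs_diff(float a, float b) {
    return select(a > b, a - b, b - a);
}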
  813. ///////////////////////////////////////////////////////////////////////////
  814. // Horizontal ops / reductions
  815. __declspec(safe)
  816. static inline uniform int16 reduce_add(int8 x) {
  817. return __reduce_add_int8(__mask ? x : (int8)0);
  818. }
  819. __declspec(safe)
  820. static inline uniform unsigned int16 reduce_add(unsigned int8 x) {
  821. return __reduce_add_int8(__mask ? x : (int8)0);
  822. }
  823. __declspec(safe)
  824. static inline uniform int32 reduce_add(int16 x) {
  825. return __reduce_add_int16(__mask ? x : (int16)0);
  826. }
  827. __declspec(safe)
  828. static inline uniform unsigned int32 reduce_add(unsigned int16 x) {
  829. return __reduce_add_int16(__mask ? x : (int16)0);
  830. }
  831. __declspec(safe)
  832. static inline uniform float reduce_add(float x) {
  833. // zero the lanes where the mask is off
  834. return __reduce_add_float(__mask ? x : 0.);
  835. }
  836. __declspec(safe)
  837. static inline uniform float reduce_min(float v) {
  838. // For the lanes where the mask is off, replace the given value with
  839. // infinity, so that it doesn't affect the result.
  840. int iflt_max = 0x7f800000; // infinity
  841. // An unmasked block is needed to make sure that the argument for the
  842. // unmasked function __reduce_min_float() is calculated without a mask.
  843. bool test = __mask;
  844. uniform float result;
  845. unmasked {
  846. result = __reduce_min_float(test ? v : floatbits(iflt_max));
  847. }
  848. return result;
  849. }
  850. __declspec(safe)
  851. static inline uniform float reduce_max(float v) {
  852. // For the lanes where the mask is off, replace the given value with
  853. // negative infinity, so that it doesn't affect the result.
  854. const int iflt_neg_max = 0xff800000; // -infinity
  855. // An unmasked block is needed to make sure that the argument for the
  856. // unmasked function __reduce_max_float() is calculated without a mask.
  857. bool test = __mask;
  858. uniform float result;
  859. unmasked {
  860. result = __reduce_max_float(test ? v : floatbits(iflt_neg_max));
  861. }
  862. return result;
  863. }
  864. __declspec(safe)
  865. static inline uniform int64 reduce_add(int32 x) {
  866. // Zero out the values for lanes that aren't running
  867. return __reduce_add_int32(__mask ? x : 0);
  868. }
  869. __declspec(safe)
  870. static inline uniform int reduce_min(int v) {
  871. // Set values for non-running lanes to the maximum integer value so
  872. // they don't affect the result.
  873. int int_max = 0x7fffffff;
  874. return __reduce_min_int32(__mask ? v : int_max);
  875. }
  876. __declspec(safe)
  877. static inline uniform int reduce_max(int v) {
  878. // Set values for non-running lanes to the minimum integer value so
  879. // they don't affect the result.
  880. int int_min = 0x80000000;
  881. return __reduce_max_int32(__mask ? v : int_min);
  882. }
  883. __declspec(safe)
  884. static inline uniform unsigned int64 reduce_add(unsigned int32 x) {
  885. // Set values for non-running lanes to zero so they don't affect the
  886. // result.
  887. return __reduce_add_int32(__mask ? x : 0);
  888. }
  889. __declspec(safe)
  890. static inline uniform unsigned int reduce_min(unsigned int v) {
  891. // Set values for non-running lanes to the maximum unsigned integer
  892. // value so they don't affect the result.
  893. unsigned int uint_max = 0xffffffff;
  894. return __reduce_min_uint32(__mask ? v : uint_max);
  895. }
  896. __declspec(safe)
  897. static inline uniform unsigned int reduce_max(unsigned int v) {
  898. // Set values for non-running lanes to zero so they don't affect the
  899. // result.
  900. return __reduce_max_uint32(__mask ? v : 0);
  901. }
  902. __declspec(safe)
  903. static inline uniform double reduce_add(double x) {
  904. // zero the lanes where the mask is off
  905. return __reduce_add_double(__mask ? x : 0.);
  906. }
  907. __declspec(safe)
  908. static inline uniform double reduce_min(double v) {
  909. int64 iflt_max = 0x7ff0000000000000; // infinity
  910. // An unmasked block is needed to make sure that the argument for the
  911. // unmasked function __reduce_min_double() is calculated without a mask.
  912. bool test = __mask;
  913. uniform double result;
  914. unmasked {
  915. result = __reduce_min_double(test ? v : doublebits(iflt_max));
  916. }
  917. return result;
  918. }
  919. __declspec(safe)
  920. static inline uniform double reduce_max(double v) {
  921. const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
  922. // An unmasked block is needed to make sure that the argument for the
  923. // unmasked function __reduce_max_double() is calculated without a mask.
  924. bool test = __mask;
  925. uniform double result;
  926. unmasked {
  927. result = __reduce_max_double(test ? v : doublebits(iflt_neg_max));
  928. }
  929. return result;
  930. }
  931. __declspec(safe)
  932. static inline uniform int64 reduce_add(int64 x) {
  933. // Zero out the values for lanes that aren't running
  934. return __reduce_add_int64(__mask ? x : 0);
  935. }
  936. __declspec(safe)
  937. static inline uniform int64 reduce_min(int64 v) {
  938. // Set values for non-running lanes to the maximum integer value so
  939. // they don't affect the result.
  940. int64 int_max = 0x7fffffffffffffff;
  941. return __reduce_min_int64(__mask ? v : int_max);
  942. }
  943. __declspec(safe)
  944. static inline uniform int64 reduce_max(int64 v) {
  945. // Set values for non-running lanes to the minimum integer value so
  946. // they don't affect the result.
  947. int64 int_min = 0x8000000000000000;
  948. return __reduce_max_int64(__mask ? v : int_min);
  949. }
  950. __declspec(safe)
  951. static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
  952. // Set values for non-running lanes to zero so they don't affect the
  953. // result.
  954. return __reduce_add_int64(__mask ? x : 0);
  955. }
  956. __declspec(safe)
  957. static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
  958. // Set values for non-running lanes to the maximum unsigned integer
  959. // value so they don't affect the result.
  960. unsigned int64 uint_max = 0xffffffffffffffff;
  961. return __reduce_min_uint64(__mask ? v : uint_max);
  962. }
  963. __declspec(safe)
  964. static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
  965. // Set values for non-running lanes to zero so they don't affect the
  966. // result.
  967. return __reduce_max_uint64(__mask ? v : 0);
  968. }
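// Illustrative usage sketch, not part of the stdlib: the reductions above
// collapse a varying value into one uniform result over the active lanes.
// array_average() is a hypothetical helper for illustration; it assumes
// count > 0 and that it is called with all program instances active.
static inline uniform float array_average(uniform float a[], uniform int count) {
    float sum = 0.;
    foreach (i = 0 ... count)
        sum += a[i];
    return reduce_add(sum) / count;
}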
  969. #define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE) \
  970. __declspec(safe) \
  971. static inline uniform bool reduce_equal(TYPE v) { \
  972. uniform TYPE unusedValue; \
  973. return __reduce_equal_##FUNCTYPE(v, &unusedValue, (MASKTYPE)__mask); \
  974. } \
  975. __declspec(safe) \
  976. static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { \
  977. return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
  978. }
  979. REDUCE_EQUAL(int32, int32, IntMaskType)
  980. REDUCE_EQUAL(unsigned int32, int32, UIntMaskType)
  981. REDUCE_EQUAL(float, float, IntMaskType)
  982. REDUCE_EQUAL(int64, int64, IntMaskType)
  983. REDUCE_EQUAL(unsigned int64, int64, UIntMaskType)
  984. REDUCE_EQUAL(double, double, IntMaskType)
  985. static int32 exclusive_scan_add(int32 v) {
  986. return __exclusive_scan_add_i32(v, (IntMaskType)__mask);
  987. }
  988. static unsigned int32 exclusive_scan_add(unsigned int32 v) {
  989. return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask);
  990. }
  991. static float exclusive_scan_add(float v) {
  992. return __exclusive_scan_add_float(v, __mask);
  993. }
  994. static int64 exclusive_scan_add(int64 v) {
  995. return __exclusive_scan_add_i64(v, (IntMaskType)__mask);
  996. }
  997. static unsigned int64 exclusive_scan_add(unsigned int64 v) {
  998. return __exclusive_scan_add_i64(v, (UIntMaskType)__mask);
  999. }
  1000. static double exclusive_scan_add(double v) {
  1001. return __exclusive_scan_add_double(v, __mask);
  1002. }
  1003. static int32 exclusive_scan_and(int32 v) {
  1004. return __exclusive_scan_and_i32(v, (IntMaskType)__mask);
  1005. }
  1006. static unsigned int32 exclusive_scan_and(unsigned int32 v) {
  1007. return __exclusive_scan_and_i32(v, (UIntMaskType)__mask);
  1008. }
  1009. static int64 exclusive_scan_and(int64 v) {
  1010. return __exclusive_scan_and_i64(v, (IntMaskType)__mask);
  1011. }
  1012. static unsigned int64 exclusive_scan_and(unsigned int64 v) {
  1013. return __exclusive_scan_and_i64(v, (UIntMaskType)__mask);
  1014. }
  1015. static int32 exclusive_scan_or(int32 v) {
  1016. return __exclusive_scan_or_i32(v, (IntMaskType)__mask);
  1017. }
  1018. static unsigned int32 exclusive_scan_or(unsigned int32 v) {
  1019. return __exclusive_scan_or_i32(v, (UIntMaskType)__mask);
  1020. }
  1021. static int64 exclusive_scan_or(int64 v) {
  1022. return __exclusive_scan_or_i64(v, (IntMaskType)__mask);
  1023. }
  1024. static unsigned int64 exclusive_scan_or(unsigned int64 v) {
  1025. return __exclusive_scan_or_i64(v, (UIntMaskType)__mask);
  1026. }
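// Illustrative usage sketch, not part of the stdlib: exclusive_scan_add()
// gives each active lane the sum of the values in the lanes before it, which
// is the usual building block for computing per-lane output offsets before a
// scatter.  output_offset() is a hypothetical helper for illustration.
static inline int output_offset(bool keep) {
    return exclusive_scan_add((int)keep);
}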
  1027. ///////////////////////////////////////////////////////////////////////////
  1028. // packed load, store
  1029. static inline uniform int
  1030. packed_load_active(uniform unsigned int a[],
  1031. varying unsigned int * uniform vals) {
  1032. return __packed_load_active(a, vals, (UIntMaskType)__mask);
  1033. }
  1034. static inline uniform int
  1035. packed_store_active(uniform unsigned int a[],
  1036. unsigned int vals) {
  1037. return __packed_store_active(a, vals, (UIntMaskType)__mask);
  1038. }
  1039. static inline uniform int
  1040. packed_store_active2(uniform unsigned int a[],
  1041. unsigned int vals) {
  1042. return __packed_store_active2(a, vals, (UIntMaskType)__mask);
  1043. }
  1044. static inline uniform int
  1045. packed_load_active(uniform int a[], varying int * uniform vals) {
  1046. return __packed_load_active(a, vals, (IntMaskType)__mask);
  1047. }
  1048. static inline uniform int
  1049. packed_store_active(uniform int a[], int vals) {
  1050. return __packed_store_active(a, vals, (IntMaskType)__mask);
  1051. }
  1052. static inline uniform int
  1053. packed_store_active(bool active, uniform int a[], int vals) {
  1054. return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
  1055. }
  1056. static inline uniform int
  1057. packed_store_active2(uniform int a[], int vals) {
  1058. return __packed_store_active2(a, vals, (IntMaskType)__mask);
  1059. }
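// Illustrative usage sketch, not part of the stdlib: packed_store_active()
// writes one value per selected lane contiguously starting at a[0] and
// returns how many values were written, so gang-level stream compaction is a
// one-liner.  append_nonnegative() is a hypothetical helper for illustration;
// it assumes it is called with all program instances active.
static inline uniform int append_nonnegative(uniform int a[], int value) {
    return packed_store_active(value >= 0, a, value);
}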
  1060. ///////////////////////////////////////////////////////////////////////////
  1061. // System information
  1062. static inline uniform int num_cores() {
  1063. if (__is_nvptx_target)
  1064. return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx)
  1065. else
  1066. return __num_cores();
  1067. }
  1068. __declspec(safe)
  1069. static inline uniform int64 clock() {
  1070. return __clock();
  1071. }
  1072. ///////////////////////////////////////////////////////////////////////////
  1073. // Floating-Point Math
  1074. __declspec(safe,cost1)
  1075. static inline uniform bool isnan(uniform float v) {
  1076. return v != v;
  1077. }
  1078. __declspec(safe,cost1)
  1079. static inline bool isnan(float v) {
  1080. return v != v;
  1081. }
  1082. __declspec(safe,cost1)
  1083. static inline uniform bool isnan(uniform double v) {
  1084. return v != v;
  1085. }
  1086. __declspec(safe,cost1)
  1087. static inline bool isnan(double v) {
  1088. return v != v;
  1089. }
  1090. __declspec(safe,cost1)
  1091. static inline float abs(float a) {
  1092. // Floating-point hack: zeroing the high bit clears the sign
  1093. unsigned int i = intbits(a);
  1094. i &= 0x7fffffff;
  1095. return floatbits(i);
  1096. }
  1097. __declspec(safe,cost1)
  1098. static inline uniform float abs(uniform float a) {
  1099. uniform unsigned int i = intbits(a);
  1100. i &= 0x7fffffff;
  1101. return floatbits(i);
  1102. }
  1103. __declspec(safe,cost1)
  1104. static inline double abs(double a) {
  1105. // zeroing the high bit clears the sign
  1106. unsigned int64 i = intbits(a);
  1107. i &= 0x7fffffffffffffff;
  1108. return doublebits(i);
  1109. }
  1110. __declspec(safe,cost1)
  1111. static inline uniform double abs(uniform double a) {
  1112. uniform unsigned int64 i = intbits(a);
  1113. i &= 0x7fffffffffffffff;
  1114. return doublebits(i);
  1115. }
  1116. __declspec(safe,cost1)
  1117. static inline unsigned int signbits(float x) {
  1118. unsigned int i = intbits(x);
  1119. return (i & 0x80000000);
  1120. }
  1121. __declspec(safe,cost1)
  1122. static inline uniform unsigned int signbits(uniform float x) {
  1123. uniform unsigned int i = intbits(x);
  1124. return (i & 0x80000000);
  1125. }
  1126. __declspec(safe,cost1)
  1127. static inline unsigned int64 signbits(double x) {
  1128. unsigned int64 i = intbits(x);
  1129. return (i & 0x8000000000000000);
  1130. }
  1131. __declspec(safe,cost1)
  1132. static inline uniform unsigned int64 signbits(uniform double x) {
  1133. uniform unsigned int64 i = intbits(x);
  1134. return (i & 0x8000000000000000);
  1135. }
  1136. __declspec(safe,cost2)
  1137. static inline float round(float x) {
  1138. return __round_varying_float(x);
  1139. }
  1140. __declspec(safe,cost2)
  1141. static inline uniform float round(uniform float x) {
  1142. return __round_uniform_float(x);
  1143. }
  1144. __declspec(safe,cost2)
  1145. static inline double round(double x) {
  1146. return __round_varying_double(x);
  1147. }
  1148. __declspec(safe,cost2)
  1149. static inline uniform double round(uniform double x) {
  1150. return __round_uniform_double(x);
  1151. }
  1152. __declspec(safe,cost2)
  1153. static inline float floor(float x) {
  1154. return __floor_varying_float(x);
  1155. }
  1156. __declspec(safe,cost2)
  1157. static inline uniform float floor(uniform float x) {
  1158. return __floor_uniform_float(x);
  1159. }
  1160. __declspec(safe,cost2)
  1161. static inline double floor(double x) {
  1162. return __floor_varying_double(x);
  1163. }
  1164. __declspec(safe,cost2)
  1165. static inline uniform double floor(uniform double x) {
  1166. return __floor_uniform_double(x);
  1167. }
  1168. __declspec(safe,cost2)
  1169. static inline float ceil(float x) {
  1170. return __ceil_varying_float(x);
  1171. }
  1172. __declspec(safe,cost2)
  1173. static inline uniform float ceil(uniform float x) {
  1174. return __ceil_uniform_float(x);
  1175. }
  1176. __declspec(safe,cost2)
  1177. static inline double ceil(double x) {
  1178. return __ceil_varying_double(x);
  1179. }
  1180. __declspec(safe,cost2)
  1181. static inline uniform double ceil(uniform double x) {
  1182. return __ceil_uniform_double(x);
  1183. }
  1184. __declspec(safe)
  1185. static inline float rcp(float v) {
  1186. return __rcp_varying_float(v);
  1187. }
  1188. __declspec(safe)
  1189. static inline uniform float rcp(uniform float v) {
  1190. return __rcp_uniform_float(v);
  1191. }
  1192. #define RCPD(QUAL) \
  1193. __declspec(safe) \
  1194. static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \
  1195. { \
  1196. iv = iv * (2.0d - v*iv); \
  1197. iv = iv * (2.0d - v*iv); \
  1198. return iv; \
  1199. } \
  1200. __declspec(safe) \
  1201. static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \
  1202. { \
  1203. if (x <= 1.0d+33 && x >= 1.0d-33) \
  1204. return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \
  1205. QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
  1206. QUAL double exp = doublebits( 0x7fd0000000000000 + ~ex ); \
  1207. QUAL double y = rcp((QUAL float)(x*exp)); \
  1208. return __rcp_iterate_##QUAL##_double(x, y*exp); \
  1209. }
  1210. RCPD(varying)
  1211. __declspec(safe)
  1212. static inline double rcp(double v) {
  1213. if (__have_native_rcpd)
  1214. return __rcp_varying_double(v);
  1215. else
  1216. return __rcp_safe_varying_double(v);
  1217. }
  1218. RCPD(uniform)
  1219. __declspec(safe)
  1220. static inline uniform double rcp(uniform double v) {
  1221. if (__have_native_rcpd)
  1222. return __rcp_uniform_double(v);
  1223. else
  1224. return __rcp_safe_uniform_double(v);
  1225. }
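// Note (added for illustration, not from the original source): the
// __rcp_iterate_*_double() helpers above refine an initial estimate iv of 1/v
// with the Newton-Raphson step iv' = iv * (2 - v*iv), which roughly doubles
// the number of correct bits per step; two steps turn the single-precision
// rcp() seed into a double-precision reciprocal.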
  1226. ///////////////////////////////////////////////////////////////////////////
  1227. // min/max
  1228. // float
  1229. __declspec(safe,cost1)
  1230. static inline float min(float a, float b) {
  1231. return __min_varying_float(a, b);
  1232. }
  1233. __declspec(safe,cost1)
  1234. static inline uniform float min(uniform float a, uniform float b) {
  1235. return __min_uniform_float(a, b);
  1236. }
  1237. __declspec(safe,cost1)
  1238. static inline float max(float a, float b) {
  1239. return __max_varying_float(a, b);
  1240. }
  1241. __declspec(safe,cost1)
  1242. static inline uniform float max(uniform float a, uniform float b) {
  1243. return __max_uniform_float(a, b);
  1244. }
  1245. // double
  1246. __declspec(safe)
  1247. static inline double min(double a, double b) {
  1248. return __min_varying_double(a, b);
  1249. }
  1250. __declspec(safe)
  1251. static inline uniform double min(uniform double a, uniform double b) {
  1252. return __min_uniform_double(a, b);
  1253. }
  1254. __declspec(safe)
  1255. static inline double max(double a, double b) {
  1256. return __max_varying_double(a, b);
  1257. }
  1258. __declspec(safe)
  1259. static inline uniform double max(uniform double a, uniform double b) {
  1260. return __max_uniform_double(a, b);
  1261. }
  1262. // int8
  1263. __declspec(safe,cost1)
  1264. static inline uniform unsigned int8 min(uniform unsigned int8 a,
  1265. uniform unsigned int8 b) {
  1266. return (a < b) ? a : b;
  1267. }
  1268. __declspec(safe,cost1)
  1269. static inline uniform unsigned int8 max(uniform unsigned int8 a,
  1270. uniform unsigned int8 b) {
  1271. return (a > b) ? a : b;
  1272. }
  1273. __declspec(safe,cost1)
  1274. static inline uniform int8 min(uniform int8 a, uniform int8 b) {
  1275. return (a < b) ? a : b;
  1276. }
  1277. __declspec(safe,cost1)
  1278. static inline uniform int8 max(uniform int8 a, uniform int8 b) {
  1279. return (a > b) ? a : b;
  1280. }
  1281. __declspec(safe,cost1)
  1282. static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) {
  1283. return (a < b) ? a : b;
  1284. }
  1285. __declspec(safe,cost1)
  1286. static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) {
  1287. return (a > b) ? a : b;
  1288. }
  1289. __declspec(safe,cost1)
  1290. static inline int8 min(int8 a, int8 b) {
  1291. return (a < b) ? a : b;
  1292. }
  1293. __declspec(safe,cost1)
  1294. static inline int8 max(int8 a, int8 b) {
  1295. return (a > b) ? a : b;
  1296. }
  1297. // int16
  1298. __declspec(safe,cost1)
  1299. static inline uniform unsigned int16 min(uniform unsigned int16 a,
  1300. uniform unsigned int16 b) {
  1301. return (a < b) ? a : b;
  1302. }
  1303. __declspec(safe,cost1)
  1304. static inline uniform unsigned int16 max(uniform unsigned int16 a,
  1305. uniform unsigned int16 b) {
  1306. return (a > b) ? a : b;
  1307. }
  1308. __declspec(safe,cost1)
  1309. static inline uniform int16 min(uniform int16 a, uniform int16 b) {
  1310. return (a < b) ? a : b;
  1311. }
  1312. __declspec(safe,cost1)
  1313. static inline uniform int16 max(uniform int16 a, uniform int16 b) {
  1314. return (a > b) ? a : b;
  1315. }
  1316. __declspec(safe,cost1)
  1317. static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) {
  1318. return (a < b) ? a : b;
  1319. }
  1320. __declspec(safe,cost1)
  1321. static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) {
  1322. return (a > b) ? a : b;
  1323. }
  1324. __declspec(safe,cost1)
  1325. static inline int16 min(int16 a, int16 b) {
  1326. return (a < b) ? a : b;
  1327. }
  1328. __declspec(safe,cost1)
  1329. static inline int16 max(int16 a, int16 b) {
  1330. return (a > b) ? a : b;
  1331. }
  1332. // int32
  1333. __declspec(safe,cost1)
  1334. static inline unsigned int min(unsigned int a, unsigned int b) {
  1335. return __min_varying_uint32(a, b);
  1336. }
  1337. __declspec(safe,cost1)
  1338. static inline uniform unsigned int min(uniform unsigned int a, uniform unsigned int b) {
  1339. return __min_uniform_uint32(a, b);
  1340. }
  1341. __declspec(safe,cost1)
  1342. static inline unsigned int max(unsigned int a, unsigned int b) {
  1343. return __max_varying_uint32(a, b);
  1344. }
  1345. __declspec(safe,cost1)
  1346. static inline uniform unsigned int max(uniform unsigned int a, uniform unsigned int b) {
  1347. return __max_uniform_uint32(a, b);
  1348. }
  1349. __declspec(safe,cost1)
  1350. static inline int min(int a, int b) {
  1351. return __min_varying_int32(a, b);
  1352. }
  1353. __declspec(safe,cost1)
  1354. static inline uniform int min(uniform int a, uniform int b) {
  1355. return __min_uniform_int32(a, b);
  1356. }
  1357. __declspec(safe,cost1)
  1358. static inline int max(int a, int b) {
  1359. return __max_varying_int32(a, b);
  1360. }
  1361. __declspec(safe,cost1)
  1362. static inline uniform int max(uniform int a, uniform int b) {
  1363. return __max_uniform_int32(a, b);
  1364. }
  1365. // int64
  1366. __declspec(safe,cost1)
  1367. static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
  1368. return __min_varying_uint64(a, b);
  1369. }
  1370. __declspec(safe,cost1)
  1371. static inline uniform unsigned int64 min(uniform unsigned int64 a, uniform unsigned int64 b) {
  1372. return __min_uniform_uint64(a, b);
  1373. }
  1374. __declspec(safe,cost1)
  1375. static inline unsigned int64 max(unsigned int64 a, unsigned int64 b) {
  1376. return __max_varying_uint64(a, b);
  1377. }
  1378. __declspec(safe,cost1)
  1379. static inline uniform unsigned int64 max(uniform unsigned int64 a, uniform unsigned int64 b) {
  1380. return __max_uniform_uint64(a, b);
  1381. }
  1382. __declspec(safe,cost1)
  1383. static inline int64 min(int64 a, int64 b) {
  1384. return __min_varying_int64(a, b);
  1385. }
  1386. __declspec(safe,cost1)
  1387. static inline uniform int64 min(uniform int64 a, uniform int64 b) {
  1388. return __min_uniform_int64(a, b);
  1389. }
  1390. __declspec(safe,cost1)
  1391. static inline int64 max(int64 a, int64 b) {
  1392. return __max_varying_int64(a, b);
  1393. }
  1394. __declspec(safe,cost1)
  1395. static inline uniform int64 max(uniform int64 a, uniform int64 b) {
  1396. return __max_uniform_int64(a, b);
  1397. }
  1398. ///////////////////////////////////////////////////////////////////////////
  1399. // clamps
  1400. // float
  1401. __declspec(safe,cost2)
  1402. static inline float clamp(float v, float low, float high) {
  1403. return min(max(v, low), high);
  1404. }
  1405. __declspec(safe,cost2)
  1406. static inline uniform float clamp(uniform float v, uniform float low, uniform float high) {
  1407. return min(max(v, low), high);
  1408. }
  1409. // double
  1410. __declspec(safe,cost2)
  1411. static inline double clamp(double v, double low, double high) {
  1412. return min(max(v, low), high);
  1413. }
  1414. __declspec(safe,cost2)
  1415. static inline uniform double clamp(uniform double v, uniform double low, uniform double high) {
  1416. return min(max(v, low), high);
  1417. }
  1418. // int8
  1419. __declspec(safe,cost2)
  1420. static inline unsigned int8 clamp(unsigned int8 v, unsigned int8 low,
  1421. unsigned int8 high) {
  1422. return min(max(v, low), high);
  1423. }
  1424. __declspec(safe,cost2)
  1425. static inline uniform unsigned int8 clamp(uniform unsigned int8 v,
  1426. uniform unsigned int8 low,
  1427. uniform unsigned int8 high) {
  1428. return min(max(v, low), high);
  1429. }
  1430. __declspec(safe,cost2)
  1431. static inline int8 clamp(int8 v, int8 low, int8 high) {
  1432. return min(max(v, low), high);
  1433. }
  1434. __declspec(safe,cost2)
  1435. static inline uniform int8 clamp(uniform int8 v, uniform int8 low,
  1436. uniform int8 high) {
  1437. return min(max(v, low), high);
  1438. }
  1439. // int16
  1440. __declspec(safe,cost2)
  1441. static inline unsigned int16 clamp(unsigned int16 v, unsigned int16 low,
  1442. unsigned int16 high) {
  1443. return min(max(v, low), high);
  1444. }
  1445. __declspec(safe,cost2)
  1446. static inline uniform unsigned int16 clamp(uniform unsigned int16 v,
  1447. uniform unsigned int16 low,
  1448. uniform unsigned int16 high) {
  1449. return min(max(v, low), high);
  1450. }
  1451. __declspec(safe,cost2)
  1452. static inline int16 clamp(int16 v, int16 low, int16 high) {
  1453. return min(max(v, low), high);
  1454. }
  1455. __declspec(safe,cost2)
  1456. static inline uniform int16 clamp(uniform int16 v, uniform int16 low,
  1457. uniform int16 high) {
  1458. return min(max(v, low), high);
  1459. }
  1460. // int32
  1461. __declspec(safe,cost2)
  1462. static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int high) {
  1463. return min(max(v, low), high);
  1464. }
  1465. __declspec(safe,cost2)
  1466. static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigned int low,
  1467. uniform unsigned int high) {
  1468. return min(max(v, low), high);
  1469. }
  1470. __declspec(safe,cost2)
  1471. static inline int clamp(int v, int low, int high) {
  1472. return min(max(v, low), high);
  1473. }
  1474. __declspec(safe,cost2)
  1475. static inline uniform int clamp(uniform int v, uniform int low, uniform int high) {
  1476. return min(max(v, low), high);
  1477. }
  1478. // int64
  1479. __declspec(safe,cost2)
  1480. static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low,
  1481. unsigned int64 high) {
  1482. return min(max(v, low), high);
  1483. }
  1484. __declspec(safe,cost2)
  1485. static inline uniform unsigned int64 clamp(uniform unsigned int64 v,
  1486. uniform unsigned int64 low,
  1487. uniform unsigned int64 high) {
  1488. return min(max(v, low), high);
  1489. }
  1490. __declspec(safe,cost2)
  1491. static inline int64 clamp(int64 v, int64 low, int64 high) {
  1492. return min(max(v, low), high);
  1493. }
  1494. __declspec(safe,cost2)
  1495. static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
  1496. uniform int64 high) {
  1497. return min(max(v, low), high);
  1498. }
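// Illustrative usage sketch (hypothetical values, not exhaustive): clamp()
// resolves to the overload matching its argument types, for both uniform and
// varying operands, e.g.
//
//     float t = clamp(v, 0.0f, 1.0f);            // per-lane clamp to [0, 1]
//     uniform int b = clamp(u, 0, 255);          // uniform clamp to [0, 255]
//
// where v is a varying float and u a uniform int; mixing uniform and varying
// arguments promotes to the varying overload under the usual conversion rules.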
  1499. ///////////////////////////////////////////////////////////////////////////
  1500. // Global atomics and memory barriers
  1501. static inline void memory_barrier() {
  1502. __memory_barrier();
  1503. }
  1504. #define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
  1505. static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
  1506. TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
  1507. return ret; \
  1508. } \
  1509. static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
  1510. uniform TA value) { \
  1511. uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
  1512. return ret; \
  1513. } \
  1514. static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
  1515. if (__is_nvptx_target) { \
  1516. TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
  1517. return ret; \
  1518. } else { \
  1519. uniform TA * uniform ptrArray[programCount]; \
  1520. ptrArray[programIndex] = ptr; \
  1521. TA ret; \
  1522. foreach_active (i) { \
  1523. uniform TA * uniform p = ptrArray[i]; \
  1524. uniform TA v = extract(value, i); \
  1525. uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
  1526. ret = insert(ret, i, r); \
  1527. } \
  1528. return ret; \
  1529. } \
1530. }
  1531. #define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \
  1532. static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
  1533. if (__is_nvptx_target) { \
  1534. TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
  1535. return ret; \
  1536. } else { \
  1537. uniform int i = 0; \
  1538. TA ret[programCount]; \
  1539. TA memVal; \
  1540. uniform int lastSwap; \
  1541. uniform unsigned int64 mask = lanemask(); \
  1542. /* First, have the first running program instance (if any) perform \
  1543. the swap with memory with its value of "value"; record the \
  1544. value returned. */ \
  1545. for (; i < programCount; ++i) { \
  1546. if ((mask & (1ull << i)) == 0) \
  1547. continue; \
  1548. memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
  1549. lastSwap = i; \
  1550. break; \
  1551. } \
  1552. /* Now, for all of the remaining running program instances, set the \
  1553. return value of the last instance that did a swap with this \
  1554. instance's value of "value"; this gives the same effect as if the \
  1555. current instance had executed a hardware atomic swap right before \
  1556. the last one that did a swap. */ \
  1557. for (; i < programCount; ++i) { \
  1558. if ((mask & (1ull << i)) == 0) \
  1559. continue; \
  1560. ret[lastSwap] = extract(value, i); \
  1561. lastSwap = i; \
  1562. } \
  1563. /* And the last instance that wanted to swap gets the value we \
  1564. originally got back from memory... */ \
  1565. ret[lastSwap] = memVal; \
  1566. return ret[programIndex]; \
  1567. }\
  1568. } \
  1569. static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
  1570. uniform TA value) { \
  1571. uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
  1572. return ret; \
  1573. } \
  1574. static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
  1575. if (__is_nvptx_target) { \
  1576. TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
  1577. return ret; \
  1578. } else { \
  1579. uniform TA * uniform ptrArray[programCount]; \
  1580. ptrArray[programIndex] = ptr; \
  1581. TA ret; \
  1582. foreach_active (i) { \
  1583. uniform TA * uniform p = ptrArray[i]; \
  1584. uniform TA v = extract(value, i); \
  1585. uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
  1586. ret = insert(ret, i, r); \
  1587. } \
  1588. return ret; \
  1589. }\
1590. }
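// Worked example (illustrative) of the serialized swap emulation above, for
// the uniform-pointer / varying-value case on non-NVPTX targets: suppose
// lanes 0, 2 and 3 are active with values 10, 20 and 30, and *ptr initially
// holds 5.  The first active lane (0) does the one real hardware swap, so
// memVal = 5 and memory now holds 10.  The chaining loop then gives lane 0
// the next active lane's value (20), lane 2 gets lane 3's value (30), and the
// last swapping lane (3) receives the original memory value (5).  This is the
// same final memory state and the same per-lane results as hardware swaps
// executed back-to-back in the order lane 3, lane 2, lane 0.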
  1591. #define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
  1592. static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
  1593. uniform TA oneval = reduce_##OPA(value); \
  1594. TA ret; \
  1595. if (lanemask() != 0) \
  1596. ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
  1597. return ret; \
  1598. } \
  1599. static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
  1600. uniform TA value) { \
  1601. uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
  1602. return ret; \
  1603. } \
  1604. static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
  1605. TA value) { \
  1606. if (__is_nvptx_target) { \
  1607. TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
  1608. return ret; \
  1609. } else { \
  1610. uniform TA * uniform ptrArray[programCount]; \
  1611. ptrArray[programIndex] = ptr; \
  1612. TA ret; \
  1613. foreach_active (i) { \
  1614. uniform TA * uniform p = ptrArray[i]; \
  1615. uniform TA v = extract(value, i); \
  1616. uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
  1617. ret = insert(ret, i, r); \
  1618. } \
  1619. return ret; \
  1620. } \
  1621. }
  1622. DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
  1623. DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
  1624. DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
  1625. DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
  1626. DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
  1627. DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
  1628. DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
  1629. DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)
  1630. // For everything but atomic min and max, we can use the same
  1631. // implementations for unsigned as for signed.
  1632. DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
  1633. DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
  1634. DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
  1635. DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
  1636. DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
  1637. DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
  1638. DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
  1639. DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)
  1640. DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)
  1641. DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
  1642. DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
  1643. DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
  1644. DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
  1645. DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
  1646. DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
  1647. DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
  1648. DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)
  1649. // For everything but atomic min and max, we can use the same
  1650. // implementations for unsigned as for signed.
  1651. DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
  1652. DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
  1653. DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
  1654. DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
  1655. DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
  1656. DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
  1657. DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
  1658. DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)
  1659. DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)
  1660. #undef DEFINE_ATOMIC_OP
  1661. #undef DEFINE_ATOMIC_MINMAX_OP
  1662. #undef DEFINE_ATOMIC_SWAP
  1663. #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \
  1664. static inline uniform TA atomic_compare_exchange_global( \
  1665. uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
  1666. uniform TA ret = \
  1667. __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
  1668. return ret; \
  1669. } \
  1670. static inline TA atomic_compare_exchange_global( \
  1671. uniform TA * uniform ptr, TA oldval, TA newval) { \
  1672. TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
  1673. (MASKTYPE)__mask); \
  1674. return ret; \
  1675. } \
  1676. static inline TA atomic_compare_exchange_global( \
  1677. uniform TA * varying ptr, TA oldval, TA newval) { \
  1678. if (__is_nvptx_target) { \
  1679. TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \
  1680. return ret; \
  1681. } else { \
  1682. uniform TA * uniform ptrArray[programCount]; \
  1683. ptrArray[programIndex] = ptr; \
  1684. TA ret; \
  1685. foreach_active (i) { \
  1686. uniform TA r = \
  1687. __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
  1688. extract(oldval, i), \
  1689. extract(newval, i)); \
  1690. ret = insert(ret, i, r); \
  1691. } \
  1692. return ret; \
  1693. } \
  1694. }
  1695. ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
  1696. ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
  1697. ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
  1698. ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
  1699. ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
  1700. ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)
  1701. #undef ATOMIC_DECL_CMPXCHG
  1702. // void * variants of swap and compare exchange
  1703. static inline void *atomic_swap_global(void ** uniform ptr,
  1704. void * value) {
  1705. return (void *)atomic_swap_global((intptr_t * uniform)ptr,
  1706. (intptr_t)value);
  1707. }
  1708. static inline void * uniform atomic_swap_global(void ** uniform ptr,
  1709. void * uniform value) {
  1710. return (void * uniform)atomic_swap_global((intptr_t * uniform)ptr,
  1711. (uniform intptr_t)value);
  1712. }
  1713. static inline void *atomic_swap_global(void ** ptr, void * value) {
  1714. return (void *)atomic_swap_global((intptr_t *)ptr,
  1715. (intptr_t)value);
  1716. }
  1717. static inline void *
  1718. atomic_compare_exchange_global(void ** uniform ptr,
  1719. void * oldval, void * newval) {
  1720. return (void *)atomic_compare_exchange_global((intptr_t * uniform)ptr,
  1721. (intptr_t)oldval,
  1722. (intptr_t)newval);
  1723. }
  1724. static inline void * uniform
  1725. atomic_compare_exchange_global(void ** uniform ptr, void * uniform oldval,
  1726. void * uniform newval) {
  1727. return (void * uniform)atomic_compare_exchange_global((intptr_t * uniform)ptr,
  1728. (uniform intptr_t)oldval,
  1729. (uniform intptr_t)newval);
  1730. }
  1731. static inline void *
  1732. atomic_compare_exchange_global(void ** ptr, void * oldval,
  1733. void * newval) {
  1734. return (void *)atomic_compare_exchange_global((intptr_t *)ptr,
  1735. (intptr_t)oldval,
  1736. (intptr_t)newval);
  1737. }
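// Illustrative sketches (hypothetical variable names) of the atomic_*_global()
// overloads generated above.  Each running program instance performs its own
// atomic operation, so a shared counter hands out distinct results per lane:
//
//     uniform int * uniform counter = ...;           // shared location
//     int mySlot = atomic_add_global(counter, 1);    // unique value per lane
//
// and compare-exchange supports the usual retry loop for arbitrary updates:
//
//     int oldVal, newVal;
//     do {
//         oldVal = *counter;
//         newVal = oldVal * 2;
//     } while (atomic_compare_exchange_global(counter, oldVal, newVal) != oldVal);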
  1738. ///////////////////////////////////////////////////////////////////////////
  1739. // local atomics
  1740. #define LOCAL_ATOMIC(TYPE,NAME,OPFUNC) \
  1741. static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
  1742. uniform TYPE value) { \
  1743. uniform TYPE ret = *ptr; \
  1744. *ptr = OPFUNC(*ptr, value); \
  1745. return ret; \
  1746. } \
  1747. static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
  1748. TYPE ret; \
  1749. foreach_active (i) { \
  1750. ret = insert(ret, i, *ptr); \
  1751. *ptr = OPFUNC(*ptr, extract(value, i)); \
  1752. } \
  1753. return ret; \
  1754. } \
  1755. static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
  1756. TYPE ret; \
  1757. if (__is_nvptx_target) { \
  1758. foreach_active (i) { \
  1759. uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
  1760. ret = insert(ret, i, *ptr); \
  1761. *ptr = OPFUNC(*ptr, extract(value, i)); \
  1762. } \
  1763. } else { \
  1764. uniform TYPE * uniform ptrs[programCount]; \
  1765. ptrs[programIndex] = p; \
  1766. foreach_active (i) { \
  1767. ret = insert(ret, i, *ptrs[i]); \
  1768. *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
  1769. } \
  1770. } \
  1771. return ret; \
  1772. }
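// Note on the atomic_*_local() forms generated from the macro above: they use
// plain read-modify-write sequences with no hardware atomicity, so they are
// only serialized with respect to the program instances of the executing
// gang, not with respect to other hardware threads.  Illustrative sketch
// (hypothetical names), e.g. for a histogram private to the current task:
//
//     uniform int counts[64];                 // task-private bins
//     int bin = ...;                          // per-lane bin index in [0, 64)
//     atomic_add_local(&counts[bin], 1);      // varying-pointer overload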
  1773. static inline uniform int32 __add(uniform int32 a, uniform int32 b) { return a+b; }
  1774. static inline uniform int32 __sub(uniform int32 a, uniform int32 b) { return a-b; }
  1775. static inline uniform int32 __and(uniform int32 a, uniform int32 b) { return a & b; }
  1776. static inline uniform int32 __or(uniform int32 a, uniform int32 b) { return a | b; }
  1777. static inline uniform int32 __xor(uniform int32 a, uniform int32 b) { return a ^ b; }
  1778. static inline uniform int32 __swap(uniform int32 a, uniform int32 b) { return b; }
  1779. static inline uniform unsigned int32 __add(uniform unsigned int32 a,
  1780. uniform unsigned int32 b) { return a+b; }
  1781. static inline uniform unsigned int32 __sub(uniform unsigned int32 a,
  1782. uniform unsigned int32 b) { return a-b; }
  1783. static inline uniform unsigned int32 __and(uniform unsigned int32 a,
  1784. uniform unsigned int32 b) { return a & b; }
  1785. static inline uniform unsigned int32 __or(uniform unsigned int32 a,
  1786. uniform unsigned int32 b) { return a | b; }
  1787. static inline uniform unsigned int32 __xor(uniform unsigned int32 a,
  1788. uniform unsigned int32 b) { return a ^ b; }
  1789. static inline uniform unsigned int32 __swap(uniform unsigned int32 a,
  1790. uniform unsigned int32 b) { return b; }
  1791. static inline uniform float __add(uniform float a, uniform float b) { return a+b; }
  1792. static inline uniform float __sub(uniform float a, uniform float b) { return a-b; }
  1793. static inline uniform float __swap(uniform float a, uniform float b) { return b; }
  1794. static inline uniform int64 __add(uniform int64 a, uniform int64 b) { return a+b; }
  1795. static inline uniform int64 __sub(uniform int64 a, uniform int64 b) { return a-b; }
  1796. static inline uniform int64 __and(uniform int64 a, uniform int64 b) { return a & b; }
  1797. static inline uniform int64 __or(uniform int64 a, uniform int64 b) { return a | b; }
  1798. static inline uniform int64 __xor(uniform int64 a, uniform int64 b) { return a ^ b; }
  1799. static inline uniform int64 __swap(uniform int64 a, uniform int64 b) { return b; }
  1800. static inline uniform unsigned int64 __add(uniform unsigned int64 a,
  1801. uniform unsigned int64 b) { return a+b; }
  1802. static inline uniform unsigned int64 __sub(uniform unsigned int64 a,
  1803. uniform unsigned int64 b) { return a-b; }
  1804. static inline uniform unsigned int64 __and(uniform unsigned int64 a,
  1805. uniform unsigned int64 b) { return a & b; }
  1806. static inline uniform unsigned int64 __or(uniform unsigned int64 a,
  1807. uniform unsigned int64 b) { return a | b; }
  1808. static inline uniform unsigned int64 __xor(uniform unsigned int64 a,
  1809. uniform unsigned int64 b) { return a ^ b; }
  1810. static inline uniform unsigned int64 __swap(uniform unsigned int64 a,
  1811. uniform unsigned int64 b) { return b; }
  1812. static inline uniform double __add(uniform double a, uniform double b) { return a+b; }
  1813. static inline uniform double __sub(uniform double a, uniform double b) { return a-b; }
1814. static inline uniform double __swap(uniform double a, uniform double b) { return b; }
  1815. LOCAL_ATOMIC(int32, add, __add)
  1816. LOCAL_ATOMIC(int32, subtract, __sub)
  1817. LOCAL_ATOMIC(int32, and, __and)
  1818. LOCAL_ATOMIC(int32, or, __or)
  1819. LOCAL_ATOMIC(int32, xor, __xor)
  1820. LOCAL_ATOMIC(int32, min, min)
  1821. LOCAL_ATOMIC(int32, max, max)
  1822. LOCAL_ATOMIC(int32, swap, __swap)
  1823. LOCAL_ATOMIC(unsigned int32, add, __add)
  1824. LOCAL_ATOMIC(unsigned int32, subtract, __sub)
  1825. LOCAL_ATOMIC(unsigned int32, and, __and)
  1826. LOCAL_ATOMIC(unsigned int32, or, __or)
  1827. LOCAL_ATOMIC(unsigned int32, xor, __xor)
  1828. LOCAL_ATOMIC(unsigned int32, min, min)
  1829. LOCAL_ATOMIC(unsigned int32, max, max)
  1830. LOCAL_ATOMIC(unsigned int32, swap, __swap)
  1831. LOCAL_ATOMIC(float, add, __add)
  1832. LOCAL_ATOMIC(float, subtract, __sub)
  1833. LOCAL_ATOMIC(float, min, min)
  1834. LOCAL_ATOMIC(float, max, max)
  1835. LOCAL_ATOMIC(float, swap, __swap)
  1836. LOCAL_ATOMIC(int64, add, __add)
  1837. LOCAL_ATOMIC(int64, subtract, __sub)
  1838. LOCAL_ATOMIC(int64, and, __and)
  1839. LOCAL_ATOMIC(int64, or, __or)
  1840. LOCAL_ATOMIC(int64, xor, __xor)
  1841. LOCAL_ATOMIC(int64, min, min)
  1842. LOCAL_ATOMIC(int64, max, max)
  1843. LOCAL_ATOMIC(int64, swap, __swap)
  1844. LOCAL_ATOMIC(unsigned int64, add, __add)
  1845. LOCAL_ATOMIC(unsigned int64, subtract, __sub)
  1846. LOCAL_ATOMIC(unsigned int64, and, __and)
  1847. LOCAL_ATOMIC(unsigned int64, or, __or)
  1848. LOCAL_ATOMIC(unsigned int64, xor, __xor)
  1849. LOCAL_ATOMIC(unsigned int64, min, min)
  1850. LOCAL_ATOMIC(unsigned int64, max, max)
  1851. LOCAL_ATOMIC(unsigned int64, swap, __swap)
  1852. LOCAL_ATOMIC(double, add, __add)
  1853. LOCAL_ATOMIC(double, subtract, __sub)
  1854. LOCAL_ATOMIC(double, min, min)
  1855. LOCAL_ATOMIC(double, max, max)
  1856. LOCAL_ATOMIC(double, swap, __swap)
  1857. // compare exchange
  1858. #define LOCAL_CMPXCHG(TYPE) \
  1859. static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
  1860. uniform TYPE cmp, \
  1861. uniform TYPE update) { \
  1862. uniform TYPE old = *ptr; \
  1863. if (old == cmp) \
  1864. *ptr = update; \
  1865. return old; \
  1866. } \
  1867. static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
  1868. TYPE cmp, TYPE update) { \
  1869. TYPE ret; \
  1870. foreach_active (i) { \
  1871. uniform TYPE old = *ptr; \
  1872. if (old == extract(cmp, i)) \
  1873. *ptr = extract(update, i); \
  1874. ret = insert(ret, i, old); \
  1875. } \
  1876. return ret; \
  1877. } \
  1878. static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
  1879. TYPE cmp, TYPE update) { \
  1880. uniform TYPE * uniform ptrs[programCount]; \
  1881. ptrs[programIndex] = p; \
  1882. TYPE ret; \
  1883. foreach_active (i) { \
  1884. uniform TYPE old = *ptrs[i]; \
  1885. if (old == extract(cmp, i)) \
  1886. *ptrs[i] = extract(update, i); \
  1887. ret = insert(ret, i, old); \
  1888. } \
  1889. return ret; \
  1890. }
  1891. LOCAL_CMPXCHG(int32)
  1892. LOCAL_CMPXCHG(unsigned int32)
  1893. LOCAL_CMPXCHG(float)
  1894. LOCAL_CMPXCHG(int64)
  1895. LOCAL_CMPXCHG(unsigned int64)
  1896. LOCAL_CMPXCHG(double)
  1897. #undef LOCAL_ATOMIC
  1898. #undef LOCAL_CMPXCHG
  1899. // void * variants of swap and compare exchange
  1900. static inline void *atomic_swap_local(void ** uniform ptr,
  1901. void * value) {
  1902. return (void *)atomic_swap_local((intptr_t * uniform)ptr,
  1903. (intptr_t)value);
  1904. }
  1905. static inline void * uniform atomic_swap_local(void ** uniform ptr,
  1906. void * uniform value) {
  1907. return (void * uniform)atomic_swap_local((intptr_t * uniform)ptr,
  1908. (uniform intptr_t)value);
  1909. }
  1910. static inline void *atomic_swap_local(void ** ptr, void * value) {
  1911. return (void *)atomic_swap_local((intptr_t *)ptr,
  1912. (intptr_t)value);
  1913. }
  1914. static inline void *
  1915. atomic_compare_exchange_local(void ** uniform ptr,
  1916. void * oldval, void * newval) {
  1917. return (void *)atomic_compare_exchange_local((intptr_t * uniform)ptr,
  1918. (intptr_t)oldval,
  1919. (intptr_t)newval);
  1920. }
  1921. static inline void * uniform
  1922. atomic_compare_exchange_local(void ** uniform ptr, void * uniform oldval,
  1923. void * uniform newval) {
  1924. return (void * uniform)atomic_compare_exchange_local((intptr_t * uniform)ptr,
  1925. (uniform intptr_t)oldval,
  1926. (uniform intptr_t)newval);
  1927. }
  1928. static inline void *
  1929. atomic_compare_exchange_local(void ** ptr, void * oldval,
  1930. void * newval) {
  1931. return (void *)atomic_compare_exchange_local((intptr_t *)ptr,
  1932. (intptr_t)oldval,
  1933. (intptr_t)newval);
  1934. }
  1935. ///////////////////////////////////////////////////////////////////////////
  1936. // Transcendentals (float precision)
  1937. __declspec(safe)
  1938. static inline float sqrt(float v) {
  1939. return __sqrt_varying_float(v);
  1940. }
  1941. __declspec(safe)
  1942. static inline uniform float sqrt(uniform float v) {
  1943. return __sqrt_uniform_float(v);
  1944. }
  1945. __declspec(safe)
  1946. static inline float rsqrt(float v) {
  1947. return __rsqrt_varying_float(v);
  1948. }
  1949. __declspec(safe)
  1950. static inline uniform float rsqrt(uniform float v) {
  1951. return __rsqrt_uniform_float(v);
  1952. }
  1953. __declspec(safe)
  1954. static inline float ldexp(float x, int n) {
  1955. unsigned int ex = 0x7F800000u;
  1956. unsigned int ix = intbits(x);
1957. ex &= ix; // extract old exponent
  1958. ix = ix & ~0x7F800000u; // clear exponent
  1959. n = (n << 23) + ex;
  1960. ix |= n; // insert new exponent
  1961. return floatbits(ix);
  1962. }
  1963. __declspec(safe)
  1964. static inline uniform float ldexp(uniform float x, uniform int n) {
  1965. uniform unsigned int ex = 0x7F800000u;
  1966. uniform unsigned int ix = intbits(x);
1967. ex &= ix; // extract old exponent
  1968. ix = ix & ~0x7F800000u; // clear exponent
  1969. n = (n << 23) + ex;
  1970. ix |= n; // insert new exponent
  1971. return floatbits(ix);
  1972. }
  1973. __declspec(safe)
  1974. static inline float frexp(float x, varying int * uniform pw2) {
  1975. unsigned int ex = 0x7F800000u; // exponent mask
  1976. unsigned int ix = intbits(x);
  1977. ex &= ix;
  1978. ix &= ~0x7F800000u; // clear exponent
  1979. *pw2 = (int)(ex >> 23) - 126; // compute exponent
  1980. ix |= 0x3F000000u; // insert exponent +1 in x
  1981. return floatbits(ix);
  1982. }
  1983. __declspec(safe)
  1984. static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
  1985. uniform unsigned int ex = 0x7F800000u; // exponent mask
  1986. uniform unsigned int ix = intbits(x);
  1987. ex &= ix;
  1988. ix &= ~0x7F800000u; // clear exponent
  1989. *pw2 = (uniform int)(ex >> 23) - 126; // compute exponent
  1990. ix |= 0x3F000000u; // insert exponent +1 in x
  1991. return floatbits(ix);
  1992. }
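// Worked example (illustrative) of the exponent-field manipulation used by
// ldexp()/frexp() above, in terms of the IEEE 754 single-precision layout
// (1 sign bit, 8 exponent bits biased by 127, 23 mantissa bits):
//
//     intbits(6.0f) = 0x40C00000              // biased exponent 129
//     ldexp(6.0f, 3)                          // adds 3 << 23: 6 * 2^3 = 48
//     frexp(48.0f, &e)                        // returns 0.75 with e = 6,
//                                             // since 48 = 0.75 * 2^6
//
// frexp() rewrites the stored exponent to 126 (the 0x3F000000 pattern), which
// places the returned fraction in [0.5, 1).  Neither routine special-cases
// zero, denormals, infinities or NaN.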
  1993. // Most of the transcendental implementations in ispc code here come from
  1994. // Solomon Boulos's "syrah": https://github.com/boulos/syrah/
  1995. __declspec(safe)
  1996. static inline float sin(float x_full) {
  1997. if (__have_native_trigonometry)
  1998. {
  1999. return __sin_varying_float(x_full);
  2000. }
  2001. else if (__math_lib == __math_lib_svml) {
  2002. return __svml_sinf(x_full);
  2003. }
  2004. else if (__math_lib == __math_lib_system) {
  2005. float ret;
  2006. foreach_active (i) {
  2007. uniform float r = __stdlib_sinf(extract(x_full, i));
  2008. ret = insert(ret, i, r);
  2009. }
  2010. return ret;
  2011. }
  2012. else if (__math_lib == __math_lib_ispc ||
  2013. __math_lib == __math_lib_ispc_fast) {
  2014. static const float pi_over_two_vec = 1.57079637050628662109375;
  2015. static const float two_over_pi_vec = 0.636619746685028076171875;
  2016. float scaled = x_full * two_over_pi_vec;
  2017. float k_real = floor(scaled);
  2018. int k = (int)k_real;
  2019. // Reduced range version of x
  2020. float x = x_full - k_real * pi_over_two_vec;
  2021. int k_mod4 = k & 3;
  2022. bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
  2023. bool flip_sign = (k_mod4 > 1);
  2024. // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
  2025. // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
  2026. static const float sin_c2 = -0.16666667163372039794921875;
  2027. static const float sin_c4 = 8.333347737789154052734375e-3;
  2028. static const float sin_c6 = -1.9842604524455964565277099609375e-4;
  2029. static const float sin_c8 = 2.760012648650445044040679931640625e-6;
  2030. static const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
  2031. static const float cos_c2 = -0.5;
  2032. static const float cos_c4 = 4.166664183139801025390625e-2;
  2033. static const float cos_c6 = -1.388833043165504932403564453125e-3;
  2034. static const float cos_c8 = 2.47562347794882953166961669921875e-5;
  2035. static const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
  2036. float outside = sin_usecos ? 1 : x;
  2037. float c2 = sin_usecos ? cos_c2 : sin_c2;
  2038. float c4 = sin_usecos ? cos_c4 : sin_c4;
  2039. float c6 = sin_usecos ? cos_c6 : sin_c6;
  2040. float c8 = sin_usecos ? cos_c8 : sin_c8;
  2041. float c10 = sin_usecos ? cos_c10 : sin_c10;
  2042. float x2 = x * x;
  2043. float formula = x2 * c10 + c8;
  2044. formula = x2 * formula + c6;
  2045. formula = x2 * formula + c4;
  2046. formula = x2 * formula + c2;
  2047. formula = x2 * formula + 1;
  2048. formula *= outside;
  2049. formula = flip_sign ? -formula : formula;
  2050. return formula;
  2051. }
  2052. }
  2053. __declspec(safe)
  2054. static inline uniform float sin(uniform float x_full) {
  2055. if (__have_native_trigonometry)
  2056. {
  2057. return __sin_uniform_float(x_full);
  2058. }
  2059. else if (__math_lib == __math_lib_system ||
  2060. __math_lib == __math_lib_svml) {
  2061. return __stdlib_sinf(x_full);
  2062. }
  2063. else if (__math_lib == __math_lib_ispc ||
  2064. __math_lib == __math_lib_ispc_fast) {
  2065. static const uniform float pi_over_two_vec = 1.57079637050628662109375;
  2066. static const uniform float two_over_pi_vec = 0.636619746685028076171875;
  2067. uniform float scaled = x_full * two_over_pi_vec;
  2068. uniform float k_real = floor(scaled);
  2069. uniform int k = (int)k_real;
  2070. // Reduced range version of x
  2071. uniform float x = x_full - k_real * pi_over_two_vec;
  2072. uniform int k_mod4 = k & 3;
  2073. uniform bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
  2074. uniform bool flip_sign = (k_mod4 > 1);
  2075. // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
  2076. // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
  2077. static const uniform float sin_c2 = -0.16666667163372039794921875;
  2078. static const uniform float sin_c4 = 8.333347737789154052734375e-3;
  2079. static const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
  2080. static const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
  2081. static const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
  2082. static const uniform float cos_c2 = -0.5;
  2083. static const uniform float cos_c4 = 4.166664183139801025390625e-2;
  2084. static const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
  2085. static const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
  2086. static const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
  2087. uniform float outside, c2, c4, c6, c8, c10;
  2088. if (sin_usecos) {
  2089. outside = 1.;
  2090. c2 = cos_c2;
  2091. c4 = cos_c4;
  2092. c6 = cos_c6;
  2093. c8 = cos_c8;
  2094. c10 = cos_c10;
  2095. }
  2096. else {
  2097. outside = x;
  2098. c2 = sin_c2;
  2099. c4 = sin_c4;
  2100. c6 = sin_c6;
  2101. c8 = sin_c8;
  2102. c10 = sin_c10;
  2103. }
  2104. uniform float x2 = x * x;
  2105. uniform float formula = x2 * c10 + c8;
  2106. formula = x2 * formula + c6;
  2107. formula = x2 * formula + c4;
  2108. formula = x2 * formula + c2;
  2109. formula = x2 * formula + 1.;
  2110. formula *= outside;
  2111. formula = flip_sign ? -formula : formula;
  2112. return formula;
  2113. }
  2114. }
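// Worked example (illustrative) of the argument reduction used by sin()
// above: for x_full = 4.0, scaled = 4.0 * 2/pi ~= 2.55, so k = 2 and the
// reduced argument is x = 4.0 - 2 * pi/2 ~= 0.858.  k & 3 == 2 selects the
// sine polynomial with the sign flipped, matching the identity
// sin(4.0) = -sin(4.0 - pi) ~= -0.757.  The minimax polynomials are fitted on
// [0, pi/2] only, so accuracy degrades once x_full is large enough that the
// k_real * pi_over_two_vec product loses low-order bits.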
  2115. __declspec(safe)
  2116. static inline float asin(float x0) {
2117. bool isneg = x0 < 0;
  2118. float x = abs(x0);
  2119. bool isnan = (x > 1);
  2120. float v;
  2121. if (__have_native_trigonometry)
  2122. {
  2123. return __asin_varying_float(x0);
  2124. }
  2125. else if (__math_lib == __math_lib_svml) {
  2126. return __svml_asinf(x0);
  2127. }
  2128. else if (__math_lib == __math_lib_system) {
  2129. float ret;
  2130. foreach_active (i) {
  2131. uniform float r = __stdlib_asinf(extract(x0, i));
  2132. ret = insert(ret, i, r);
  2133. }
  2134. return ret;
  2135. }
  2136. else if (__math_lib == __math_lib_ispc)
  2137. {
  2138. // sollya
  2139. // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5,6,7,8,9,10|],
  2140. // [|single...|], [1e-20;.9999999999999999]);
  2141. // avg error: 8.5716801e-09, max error: 2.1373853e-07
  2142. v = 1.57079637050628662109375f +
  2143. x * (-0.21460501849651336669921875f +
  2144. x * (8.9116774499416351318359375e-2f +
  2145. x * (-5.146093666553497314453125e-2f +
  2146. x * (3.7269376218318939208984375e-2f +
  2147. x * (-3.5882405936717987060546875e-2f +
  2148. x * (4.14929799735546112060546875e-2f +
  2149. x * (-4.25077490508556365966796875e-2f +
  2150. x * (3.05023305118083953857421875e-2f +
  2151. x * (-1.2897425331175327301025390625e-2f +
  2152. x * 2.38926825113594532012939453125e-3f)))))))));
  2153. }
  2154. else if (__math_lib == __math_lib_ispc_fast)
  2155. {
  2156. // sollya
  2157. // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
  2158. // [1e-20;.9999999999999999]);
  2159. // avg error: 1.1105439e-06, max error 1.3187528e-06
  2160. v = 1.57079517841339111328125f +
  2161. x * (-0.21450997889041900634765625f +
  2162. x * (8.78556668758392333984375e-2f +
  2163. x * (-4.489909112453460693359375e-2f +
  2164. x * (1.928029954433441162109375e-2f +
  2165. x * (-4.3095736764371395111083984375e-3f)))));
  2166. }
  2167. v *= -sqrt(1.f - x);
  2168. v = v + 1.57079637050628662109375;
  2169. if (v < 0) v = 0;
  2170. // v = max(0, v);
  2171. if (isneg) v = -v;
  2172. if (isnan) v = floatbits(0x7fc00000);
  2173. return v;
  2174. }
  2175. __declspec(safe)
  2176. static inline uniform float asin(uniform float x0) {
  2177. uniform bool isneg = x0 < 0;
  2178. uniform float x = abs(x0);
  2179. uniform bool isnan = (x > 1);
  2180. uniform float v;
  2181. if (__have_native_trigonometry)
  2182. {
  2183. return __asin_uniform_float(x0);
  2184. }
  2185. else if (__math_lib == __math_lib_svml ||
  2186. __math_lib == __math_lib_system) {
  2187. return __stdlib_asinf(x0);
  2188. }
  2189. else if (__math_lib == __math_lib_ispc)
  2190. {
  2191. // sollya
  2192. // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5,6,7,8,9,10|],
  2193. // [|single...|], [1e-20;.9999999999999999]);
  2194. // avg error: 8.5716801e-09, max error: 2.1373853e-07
  2195. v = 1.57079637050628662109375f +
  2196. x * (-0.21460501849651336669921875f +
  2197. x * (8.9116774499416351318359375e-2f +
  2198. x * (-5.146093666553497314453125e-2f +
  2199. x * (3.7269376218318939208984375e-2f +
  2200. x * (-3.5882405936717987060546875e-2f +
  2201. x * (4.14929799735546112060546875e-2f +
  2202. x * (-4.25077490508556365966796875e-2f +
  2203. x * (3.05023305118083953857421875e-2f +
  2204. x * (-1.2897425331175327301025390625e-2f +
  2205. x * 2.38926825113594532012939453125e-3f)))))))));
  2206. }
  2207. else if (__math_lib == __math_lib_ispc_fast)
  2208. {
  2209. // sollya
  2210. // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
  2211. // [1e-20;.9999999999999999]);
  2212. // avg error: 1.1105439e-06, max error 1.3187528e-06
  2213. v = 1.57079517841339111328125f +
  2214. x * (-0.21450997889041900634765625f +
  2215. x * (8.78556668758392333984375e-2f +
  2216. x * (-4.489909112453460693359375e-2f +
  2217. x * (1.928029954433441162109375e-2f +
  2218. x * (-4.3095736764371395111083984375e-3f)))));
  2219. }
  2220. v *= -sqrt(1.f - x);
  2221. v = v + 1.57079637050628662109375;
  2222. if (v < 0) v = 0;
  2223. // v = max(0, v);
  2224. if (isneg) v = -v;
  2225. if (isnan) v = floatbits(0x7fc00000);
  2226. return v;
  2227. }
  2228. __declspec(safe)
  2229. static inline float cos(float x_full) {
  2230. if (__have_native_trigonometry)
  2231. {
  2232. return __cos_varying_float(x_full);
  2233. }
2234. else if (__math_lib == __math_lib_svml) {
  2235. return __svml_cosf(x_full);
  2236. }
  2237. else if (__math_lib == __math_lib_system) {
  2238. float ret;
  2239. foreach_active (i) {
  2240. uniform float r = __stdlib_cosf(extract(x_full, i));
  2241. ret = insert(ret, i, r);
  2242. }
  2243. return ret;
  2244. }
  2245. else if (__math_lib == __math_lib_ispc ||
  2246. __math_lib == __math_lib_ispc_fast) {
  2247. static const float pi_over_two_vec = 1.57079637050628662109375;
  2248. static const float two_over_pi_vec = 0.636619746685028076171875;
  2249. float scaled = x_full * two_over_pi_vec;
  2250. float k_real = floor(scaled);
  2251. int k = (int)k_real;
  2252. // Reduced range version of x
  2253. float x = x_full - k_real * pi_over_two_vec;
  2254. int k_mod4 = k & 3;
  2255. bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
  2256. bool flip_sign = (k_mod4 == 1 || k_mod4 == 2);
  2257. const float sin_c2 = -0.16666667163372039794921875;
  2258. const float sin_c4 = 8.333347737789154052734375e-3;
  2259. const float sin_c6 = -1.9842604524455964565277099609375e-4;
  2260. const float sin_c8 = 2.760012648650445044040679931640625e-6;
  2261. const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
  2262. const float cos_c2 = -0.5;
  2263. const float cos_c4 = 4.166664183139801025390625e-2;
  2264. const float cos_c6 = -1.388833043165504932403564453125e-3;
  2265. const float cos_c8 = 2.47562347794882953166961669921875e-5;
  2266. const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
  2267. float outside = cos_usecos ? 1. : x;
  2268. float c2 = cos_usecos ? cos_c2 : sin_c2;
  2269. float c4 = cos_usecos ? cos_c4 : sin_c4;
  2270. float c6 = cos_usecos ? cos_c6 : sin_c6;
  2271. float c8 = cos_usecos ? cos_c8 : sin_c8;
  2272. float c10 = cos_usecos ? cos_c10 : sin_c10;
  2273. float x2 = x * x;
  2274. float formula = x2 * c10 + c8;
  2275. formula = x2 * formula + c6;
  2276. formula = x2 * formula + c4;
  2277. formula = x2 * formula + c2;
  2278. formula = x2 * formula + 1.;
  2279. formula *= outside;
  2280. formula = flip_sign ? -formula : formula;
  2281. return formula;
  2282. }
  2283. }
  2284. __declspec(safe)
  2285. static inline uniform float cos(uniform float x_full) {
  2286. if (__have_native_trigonometry)
  2287. {
  2288. return __cos_uniform_float(x_full);
  2289. }
  2290. else if (__math_lib == __math_lib_system ||
  2291. __math_lib == __math_lib_svml) {
  2292. return __stdlib_cosf(x_full);
  2293. }
  2294. else if (__math_lib == __math_lib_ispc ||
  2295. __math_lib == __math_lib_ispc_fast) {
  2296. static const uniform float pi_over_two_vec = 1.57079637050628662109375;
  2297. static const uniform float two_over_pi_vec = 0.636619746685028076171875;
  2298. uniform float scaled = x_full * two_over_pi_vec;
  2299. uniform float k_real = floor(scaled);
  2300. uniform int k = (int)k_real;
  2301. // Reduced range version of x
  2302. uniform float x = x_full - k_real * pi_over_two_vec;
  2303. uniform int k_mod4 = k & 3;
  2304. uniform bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
  2305. uniform bool flip_sign = (k_mod4 == 1 || k_mod4 == 2);
  2306. const uniform float sin_c2 = -0.16666667163372039794921875;
  2307. const uniform float sin_c4 = 8.333347737789154052734375e-3;
  2308. const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
  2309. const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
  2310. const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
  2311. const uniform float cos_c2 = -0.5;
  2312. const uniform float cos_c4 = 4.166664183139801025390625e-2;
  2313. const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
  2314. const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
  2315. const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
  2316. uniform float outside, c2, c4, c6, c8, c10;
  2317. if (cos_usecos) {
  2318. outside = 1.;
  2319. c2 = cos_c2;
  2320. c4 = cos_c4;
  2321. c6 = cos_c6;
  2322. c8 = cos_c8;
  2323. c10 = cos_c10;
  2324. }
  2325. else {
  2326. outside = x;
  2327. c2 = sin_c2;
  2328. c4 = sin_c4;
  2329. c6 = sin_c6;
  2330. c8 = sin_c8;
  2331. c10 = sin_c10;
  2332. }
  2333. uniform float x2 = x * x;
  2334. uniform float formula = x2 * c10 + c8;
  2335. formula = x2 * formula + c6;
  2336. formula = x2 * formula + c4;
  2337. formula = x2 * formula + c2;
  2338. formula = x2 * formula + 1.;
  2339. formula *= outside;
  2340. formula = flip_sign ? -formula : formula;
  2341. return formula;
  2342. }
  2343. }
  2344. __declspec(safe)
  2345. static inline float acos(float v) {
  2346. if (__have_native_trigonometry)
  2347. return __acos_varying_float(v);
  2348. else
  2349. return 1.57079637050628662109375 - asin(v);
  2350. }
  2351. __declspec(safe)
  2352. static inline double acos(const double v) {
  2353. if (__have_native_trigonometry)
  2354. return __acos_varying_double(v);
  2355. else
  2356. return 1.57079637050628662109375d0 - asin(v);
  2357. }
  2358. __declspec(safe)
  2359. static inline uniform float acos(uniform float v) {
  2360. if (__have_native_trigonometry)
  2361. return __acos_uniform_float(v);
  2362. else
  2363. return 1.57079637050628662109375 - asin(v);
  2364. }
  2365. __declspec(safe)
  2366. static inline uniform double acos(const uniform double v) {
  2367. if (__have_native_trigonometry)
  2368. return __acos_uniform_double(v);
  2369. else
  2370. return 1.57079637050628662109375d0 - asin(v);
  2371. }
  2372. __declspec(safe)
  2373. static inline void sincos(float x_full, varying float * uniform sin_result,
  2374. varying float * uniform cos_result) {
  2375. if (__have_native_trigonometry)
  2376. {
  2377. __sincos_varying_float(x_full,sin_result,cos_result);
  2378. }
2379. else if (__math_lib == __math_lib_svml) {
  2380. __svml_sincosf(x_full, sin_result, cos_result);
  2381. }
  2382. else if (__math_lib == __math_lib_system) {
  2383. foreach_active (i) {
  2384. uniform float s, c;
  2385. __stdlib_sincosf(extract(x_full, i), &s, &c);
  2386. *sin_result = insert(*sin_result, i, s);
  2387. *cos_result = insert(*cos_result, i, c);
  2388. }
  2389. }
  2390. else if (__math_lib == __math_lib_ispc ||
  2391. __math_lib == __math_lib_ispc_fast) {
  2392. const float pi_over_two_vec = 1.57079637050628662109375;
  2393. const float two_over_pi_vec = 0.636619746685028076171875;
  2394. float scaled = x_full * two_over_pi_vec;
  2395. float k_real = floor(scaled);
  2396. int k = (int)k_real;
  2397. // Reduced range version of x
  2398. float x = x_full - k_real * pi_over_two_vec;
  2399. int k_mod4 = k & 3;
  2400. bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
  2401. bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
  2402. bool sin_flipsign = (k_mod4 > 1);
  2403. bool cos_flipsign = (k_mod4 == 1 || k_mod4 == 2);
  2404. const float one_vec = 1.;
  2405. const float sin_c2 = -0.16666667163372039794921875;
  2406. const float sin_c4 = 8.333347737789154052734375e-3;
  2407. const float sin_c6 = -1.9842604524455964565277099609375e-4;
  2408. const float sin_c8 = 2.760012648650445044040679931640625e-6;
  2409. const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
  2410. const float cos_c2 = -0.5;
  2411. const float cos_c4 = 4.166664183139801025390625e-2;
  2412. const float cos_c6 = -1.388833043165504932403564453125e-3;
  2413. const float cos_c8 = 2.47562347794882953166961669921875e-5;
  2414. const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
  2415. float x2 = x * x;
  2416. float sin_formula = x2 * sin_c10 + sin_c8;
  2417. float cos_formula = x2 * cos_c10 + cos_c8;
  2418. sin_formula = x2 * sin_formula + sin_c6;
  2419. cos_formula = x2 * cos_formula + cos_c6;
  2420. sin_formula = x2 * sin_formula + sin_c4;
  2421. cos_formula = x2 * cos_formula + cos_c4;
  2422. sin_formula = x2 * sin_formula + sin_c2;
  2423. cos_formula = x2 * cos_formula + cos_c2;
  2424. sin_formula = x2 * sin_formula + one_vec;
  2425. cos_formula = x2 * cos_formula + one_vec;
  2426. sin_formula *= x;
  2427. *sin_result = sin_usecos ? cos_formula : sin_formula;
  2428. *cos_result = cos_usecos ? cos_formula : sin_formula;
  2429. *sin_result = sin_flipsign ? -*sin_result : *sin_result;
  2430. *cos_result = cos_flipsign ? -*cos_result : *cos_result;
  2431. }
  2432. }
  2433. __declspec(safe)
  2434. static inline void sincos(uniform float x_full, uniform float * uniform sin_result,
  2435. uniform float * uniform cos_result) {
  2436. if (__have_native_trigonometry)
  2437. {
  2438. __sincos_uniform_float(x_full, sin_result, cos_result);
  2439. }
2440. else if (__math_lib == __math_lib_system ||
  2441. __math_lib == __math_lib_svml) {
  2442. __stdlib_sincosf(x_full, sin_result, cos_result);
  2443. }
  2444. else if (__math_lib == __math_lib_ispc ||
  2445. __math_lib == __math_lib_ispc_fast) {
  2446. const uniform float pi_over_two_vec = 1.57079637050628662109375;
  2447. const uniform float two_over_pi_vec = 0.636619746685028076171875;
  2448. uniform float scaled = x_full * two_over_pi_vec;
  2449. uniform float k_real = floor(scaled);
  2450. uniform int k = (uniform int)k_real;
  2451. // Reduced range version of x
  2452. uniform float x = x_full - k_real * pi_over_two_vec;
  2453. uniform int k_mod4 = k & 3;
  2454. uniform bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
  2455. uniform bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
  2456. uniform bool sin_flipsign = (k_mod4 > 1);
  2457. uniform bool cos_flipsign = (k_mod4 == 1 || k_mod4 == 2);
  2458. const uniform float one_vec = 1.;
  2459. const uniform float sin_c2 = -0.16666667163372039794921875;
  2460. const uniform float sin_c4 = 8.333347737789154052734375e-3;
  2461. const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
  2462. const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
  2463. const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
  2464. const uniform float cos_c2 = -0.5;
  2465. const uniform float cos_c4 = 4.166664183139801025390625e-2;
  2466. const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
  2467. const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
  2468. const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
  2469. uniform float x2 = x * x;
  2470. uniform float sin_formula = x2 * sin_c10 + sin_c8;
  2471. uniform float cos_formula = x2 * cos_c10 + cos_c8;
  2472. sin_formula = x2 * sin_formula + sin_c6;
  2473. cos_formula = x2 * cos_formula + cos_c6;
  2474. sin_formula = x2 * sin_formula + sin_c4;
  2475. cos_formula = x2 * cos_formula + cos_c4;
  2476. sin_formula = x2 * sin_formula + sin_c2;
  2477. cos_formula = x2 * cos_formula + cos_c2;
  2478. sin_formula = x2 * sin_formula + one_vec;
  2479. cos_formula = x2 * cos_formula + one_vec;
  2480. sin_formula *= x;
  2481. *sin_result = sin_usecos ? cos_formula : sin_formula;
  2482. *cos_result = cos_usecos ? cos_formula : sin_formula;
  2483. *sin_result = sin_flipsign ? -*sin_result : *sin_result;
  2484. *cos_result = cos_flipsign ? -*cos_result : *cos_result;
  2485. }
  2486. }
  2487. __declspec(safe)
  2488. static inline float tan(float x_full) {
  2489. if (__have_native_trigonometry)
  2490. {
  2491. return __tan_varying_float(x_full);
  2492. }
  2493. else if (__math_lib == __math_lib_svml) {
  2494. return __svml_tanf(x_full);
  2495. }
  2496. else if (__math_lib == __math_lib_system) {
  2497. float ret;
  2498. foreach_active (i) {
  2499. uniform float r = __stdlib_tanf(extract(x_full, i));
  2500. ret = insert(ret, i, r);
  2501. }
  2502. return ret;
  2503. }
  2504. else if (__math_lib == __math_lib_ispc ||
  2505. __math_lib == __math_lib_ispc_fast) {
  2506. const float pi_over_four_vec = 0.785398185253143310546875;
  2507. const float four_over_pi_vec = 1.27323949337005615234375;
  2508. bool x_lt_0 = x_full < 0.;
  2509. float y = x_lt_0 ? -x_full : x_full;
  2510. float scaled = y * four_over_pi_vec;
  2511. float k_real = floor(scaled);
  2512. int k = (int)k_real;
  2513. float x = y - k_real * pi_over_four_vec;
  2514. // if k & 1, x -= Pi/4
  2515. bool need_offset = (k & 1) != 0;
  2516. x = need_offset ? x - pi_over_four_vec : x;
  2517. // if k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
  2518. int k_mod4 = k & 3;
  2519. bool use_cotan = (k_mod4 == 1) || (k_mod4 == 2);
  2520. const float one_vec = 1.0;
  2521. const float tan_c2 = 0.33333075046539306640625;
  2522. const float tan_c4 = 0.13339905440807342529296875;
  2523. const float tan_c6 = 5.3348250687122344970703125e-2;
  2524. const float tan_c8 = 2.46033705770969390869140625e-2;
  2525. const float tan_c10 = 2.892402000725269317626953125e-3;
  2526. const float tan_c12 = 9.500005282461643218994140625e-3;
  2527. const float cot_c2 = -0.3333333432674407958984375;
  2528. const float cot_c4 = -2.222204394638538360595703125e-2;
  2529. const float cot_c6 = -2.11752182804048061370849609375e-3;
  2530. const float cot_c8 = -2.0846328698098659515380859375e-4;
  2531. const float cot_c10 = -2.548247357481159269809722900390625e-5;
  2532. const float cot_c12 = -3.5257363606433500535786151885986328125e-7;
  2533. float x2 = x * x;
  2534. float z;
  2535. cif (use_cotan) {
  2536. float cot_val = x2 * cot_c12 + cot_c10;
  2537. cot_val = x2 * cot_val + cot_c8;
  2538. cot_val = x2 * cot_val + cot_c6;
  2539. cot_val = x2 * cot_val + cot_c4;
  2540. cot_val = x2 * cot_val + cot_c2;
  2541. cot_val = x2 * cot_val + one_vec;
  2542. // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
  2543. cot_val /= -x;
  2544. z = cot_val;
  2545. } else {
  2546. float tan_val = x2 * tan_c12 + tan_c10;
  2547. tan_val = x2 * tan_val + tan_c8;
  2548. tan_val = x2 * tan_val + tan_c6;
  2549. tan_val = x2 * tan_val + tan_c4;
  2550. tan_val = x2 * tan_val + tan_c2;
  2551. tan_val = x2 * tan_val + one_vec;
  2552. // Equation was for tan(x)/x
  2553. tan_val *= x;
  2554. z = tan_val;
  2555. }
  2556. return x_lt_0 ? -z : z;
  2557. }
  2558. }
  2559. __declspec(safe)
  2560. static inline uniform float tan(uniform float x_full) {
  2561. if (__have_native_trigonometry)
  2562. {
  2563. return __tan_uniform_float(x_full);
  2564. }
  2565. else if (__math_lib == __math_lib_system ||
  2566. __math_lib == __math_lib_svml) {
  2567. return __stdlib_tanf(x_full);
  2568. }
  2569. else if (__math_lib == __math_lib_ispc ||
  2570. __math_lib == __math_lib_ispc_fast) {
  2571. const uniform float pi_over_four_vec = 0.785398185253143310546875;
  2572. const uniform float four_over_pi_vec = 1.27323949337005615234375;
  2573. uniform bool x_lt_0 = x_full < 0.;
  2574. uniform float y = x_lt_0 ? -x_full : x_full;
  2575. uniform float scaled = y * four_over_pi_vec;
  2576. uniform float k_real = floor(scaled);
  2577. uniform int k = (int)k_real;
  2578. uniform float x = y - k_real * pi_over_four_vec;
  2579. // if k & 1, x -= Pi/4
  2580. uniform bool need_offset = (k & 1) != 0;
  2581. x = need_offset ? x - pi_over_four_vec : x;
  2582. // if k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
  2583. uniform int k_mod4 = k & 3;
  2584. uniform bool use_cotan = (k_mod4 == 1) || (k_mod4 == 2);
  2585. const uniform float one_vec = 1.0;
  2586. const uniform float tan_c2 = 0.33333075046539306640625;
  2587. const uniform float tan_c4 = 0.13339905440807342529296875;
  2588. const uniform float tan_c6 = 5.3348250687122344970703125e-2;
  2589. const uniform float tan_c8 = 2.46033705770969390869140625e-2;
  2590. const uniform float tan_c10 = 2.892402000725269317626953125e-3;
  2591. const uniform float tan_c12 = 9.500005282461643218994140625e-3;
  2592. const uniform float cot_c2 = -0.3333333432674407958984375;
  2593. const uniform float cot_c4 = -2.222204394638538360595703125e-2;
  2594. const uniform float cot_c6 = -2.11752182804048061370849609375e-3;
  2595. const uniform float cot_c8 = -2.0846328698098659515380859375e-4;
  2596. const uniform float cot_c10 = -2.548247357481159269809722900390625e-5;
  2597. const uniform float cot_c12 = -3.5257363606433500535786151885986328125e-7;
  2598. uniform float x2 = x * x;
  2599. uniform float z;
  2600. if (use_cotan) {
  2601. uniform float cot_val = x2 * cot_c12 + cot_c10;
  2602. cot_val = x2 * cot_val + cot_c8;
  2603. cot_val = x2 * cot_val + cot_c6;
  2604. cot_val = x2 * cot_val + cot_c4;
  2605. cot_val = x2 * cot_val + cot_c2;
  2606. cot_val = x2 * cot_val + one_vec;
  2607. // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
  2608. cot_val /= -x;
  2609. z = cot_val;
  2610. } else {
  2611. uniform float tan_val = x2 * tan_c12 + tan_c10;
  2612. tan_val = x2 * tan_val + tan_c8;
  2613. tan_val = x2 * tan_val + tan_c6;
  2614. tan_val = x2 * tan_val + tan_c4;
  2615. tan_val = x2 * tan_val + tan_c2;
  2616. tan_val = x2 * tan_val + one_vec;
  2617. // Equation was for tan(x)/x
  2618. tan_val *= x;
  2619. z = tan_val;
  2620. }
  2621. return x_lt_0 ? -z : z;
  2622. }
  2623. }
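// Worked example (illustrative) of the reduction used by tan() above, which
// folds the argument into [0, pi/4] in units of pi/4: for x_full = 2.0,
// scaled = 2.0 * 4/pi ~= 2.55, so k = 2 and x = 2.0 - 2 * pi/4 ~= 0.429;
// k & 3 == 2 selects the cotangent polynomial, giving
// tan(2.0) = -cot(2.0 - pi/2) ~= -2.185.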
  2624. __declspec(safe)
  2625. static inline float atan(float x_full) {
  2626. if (__have_native_trigonometry)
  2627. {
  2628. return __atan_varying_float(x_full);
  2629. }
  2630. else if (__math_lib == __math_lib_svml) {
  2631. return __svml_atanf(x_full);
  2632. }
  2633. else if (__math_lib == __math_lib_system) {
  2634. float ret;
  2635. foreach_active (i) {
  2636. uniform float r = __stdlib_atanf(extract(x_full, i));
  2637. ret = insert(ret, i, r);
  2638. }
  2639. return ret;
  2640. }
  2641. else if (__math_lib == __math_lib_ispc ||
  2642. __math_lib == __math_lib_ispc_fast) {
  2643. const float pi_over_two_vec = 1.57079637050628662109375;
  2644. // atan(-x) = -atan(x) (so flip from negative to positive first)
  2645. // if x > 1 -> atan(x) = Pi/2 - atan(1/x)
  2646. bool x_neg = x_full < 0;
  2647. float x_flipped = x_neg ? -x_full : x_full;
  2648. bool x_gt_1 = x_flipped > 1.;
  2649. float x = x_gt_1 ? 1./x_flipped : x_flipped;
  2650. // These coefficients approximate atan(x)/x
  2651. const float atan_c0 = 0.99999988079071044921875;
  2652. const float atan_c2 = -0.3333191573619842529296875;
  2653. const float atan_c4 = 0.199689209461212158203125;
  2654. const float atan_c6 = -0.14015688002109527587890625;
  2655. const float atan_c8 = 9.905083477497100830078125e-2;
  2656. const float atan_c10 = -5.93664981424808502197265625e-2;
  2657. const float atan_c12 = 2.417283318936824798583984375e-2;
  2658. const float atan_c14 = -4.6721356920897960662841796875e-3;
  2659. float x2 = x * x;
  2660. float result = x2 * atan_c14 + atan_c12;
  2661. result = x2 * result + atan_c10;
  2662. result = x2 * result + atan_c8;
  2663. result = x2 * result + atan_c6;
  2664. result = x2 * result + atan_c4;
  2665. result = x2 * result + atan_c2;
  2666. result = x2 * result + atan_c0;
  2667. result *= x;
  2668. result = x_gt_1 ? pi_over_two_vec - result : result;
  2669. result = x_neg ? -result : result;
  2670. return result;
  2671. }
  2672. }
  2673. __declspec(safe)
  2674. static inline uniform float atan(uniform float x_full) {
  2675. if (__have_native_trigonometry)
  2676. {
  2677. return __atan_uniform_float(x_full);
  2678. }
  2679. else if (__math_lib == __math_lib_system ||
  2680. __math_lib == __math_lib_svml) {
  2681. return __stdlib_atanf(x_full);
  2682. }
  2683. else if (__math_lib == __math_lib_ispc ||
  2684. __math_lib == __math_lib_ispc_fast) {
  2685. const uniform float pi_over_two_vec = 1.57079637050628662109375;
  2686. // atan(-x) = -atan(x) (so flip from negative to positive first)
  2687. // if x > 1 -> atan(x) = Pi/2 - atan(1/x)
  2688. uniform bool x_neg = x_full < 0;
  2689. uniform float x_flipped = x_neg ? -x_full : x_full;
  2690. uniform bool x_gt_1 = x_flipped > 1.;
  2691. uniform float x = x_gt_1 ? 1./x_flipped : x_flipped;
  2692. // These coefficients approximate atan(x)/x
  2693. const uniform float atan_c0 = 0.99999988079071044921875;
  2694. const uniform float atan_c2 = -0.3333191573619842529296875;
  2695. const uniform float atan_c4 = 0.199689209461212158203125;
  2696. const uniform float atan_c6 = -0.14015688002109527587890625;
  2697. const uniform float atan_c8 = 9.905083477497100830078125e-2;
  2698. const uniform float atan_c10 = -5.93664981424808502197265625e-2;
  2699. const uniform float atan_c12 = 2.417283318936824798583984375e-2;
  2700. const uniform float atan_c14 = -4.6721356920897960662841796875e-3;
  2701. uniform float x2 = x * x;
  2702. uniform float result = x2 * atan_c14 + atan_c12;
  2703. result = x2 * result + atan_c10;
  2704. result = x2 * result + atan_c8;
  2705. result = x2 * result + atan_c6;
  2706. result = x2 * result + atan_c4;
  2707. result = x2 * result + atan_c2;
  2708. result = x2 * result + atan_c0;
  2709. result *= x;
  2710. result = x_gt_1 ? pi_over_two_vec - result : result;
  2711. result = x_neg ? -result : result;
  2712. return result;
  2713. }
  2714. }
  2715. __declspec(safe)
  2716. static inline float atan2(float y, float x) {
  2717. if (__have_native_trigonometry)
  2718. {
  2719. return __atan2_varying_float(y,x);
  2720. }
  2721. else if (__math_lib == __math_lib_svml) {
  2722. return __svml_atan2f(y, x);
  2723. }
  2724. else if (__math_lib == __math_lib_system) {
  2725. float ret;
  2726. foreach_active (i) {
  2727. uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
  2728. ret = insert(ret, i, r);
  2729. }
  2730. return ret;
  2731. }
  2732. else if (__math_lib == __math_lib_ispc ||
  2733. __math_lib == __math_lib_ispc_fast) {
  2734. const float pi_vec = 3.1415926536;
  2735. const float pi_over_two_vec = 1.5707963267;
  2736. // atan2(y, x) =
  2737. //
  2738. // atan2(y > 0, x = +-0) -> Pi/2
  2739. // atan2(y < 0, x = +-0) -> -Pi/2
  2740. // atan2(y = +-0, x < +0) -> +-Pi
  2741. // atan2(y = +-0, x >= +0) -> +-0
  2742. //
  2743. // atan2(y >= 0, x < 0) -> Pi + atan(y/x)
  2744. // atan2(y < 0, x < 0) -> -Pi + atan(y/x)
  2745. // atan2(y, x > 0) -> atan(y/x)
  2746. //
  2747. // and then a bunch of code for dealing with infinities.
  2748. float y_over_x = y/x;
  2749. float atan_arg = atan(y_over_x);
  2750. bool x_lt_0 = x < 0;
  2751. bool y_lt_0 = y < 0;
  2752. float offset = x_lt_0 ? (y_lt_0 ? -pi_vec : pi_vec) : 0;
  2753. return offset + atan_arg;
  2754. }
  2755. }
  2756. __declspec(safe)
  2757. static inline uniform float atan2(uniform float y, uniform float x) {
  2758. if (__have_native_trigonometry)
  2759. {
  2760. return __atan2_uniform_float(y,x);
  2761. }
  2762. else if (__math_lib == __math_lib_system ||
  2763. __math_lib == __math_lib_svml) {
  2764. return __stdlib_atan2f(y, x);
  2765. }
  2766. else if (__math_lib == __math_lib_ispc ||
  2767. __math_lib == __math_lib_ispc_fast) {
  2768. const uniform float pi_vec = 3.1415927410125732421875;
  2769. const uniform float pi_over_two_vec = 1.57079637050628662109375;
  2770. uniform float y_over_x = y/x;
  2771. uniform float atan_arg = atan(y_over_x);
  2772. uniform bool x_lt_0 = x < 0;
  2773. uniform bool y_lt_0 = y < 0;
  2774. uniform float offset = x_lt_0 ? (y_lt_0 ? -pi_vec : pi_vec) : 0;
  2775. return offset + atan_arg;
  2776. }
  2777. }
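// Worked example (illustrative) of the quadrant handling in atan2() above:
// atan(y/x) alone only covers quadrants I and IV, so a +/-pi offset is added
// whenever x < 0.  For (y, x) = (1, -1): atan(1 / -1) = -pi/4, y >= 0 selects
// the +pi offset, and the result is 3*pi/4.  Note that the __math_lib_ispc
// path has no explicit handling for x == 0 or infinities (the comment in the
// varying version lists those cases); y/x = +/-inf simply falls through to
// atan(), which maps it to +/-pi/2.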
  2778. __declspec(safe)
  2779. static inline float exp(float x_full) {
  2780. if (__have_native_transcendentals) {
  2781. return __exp_varying_float(x_full);
  2782. }
  2783. else if (__math_lib == __math_lib_svml) {
  2784. return __svml_expf(x_full);
  2785. }
  2786. else if (__math_lib == __math_lib_system) {
  2787. float ret;
  2788. foreach_active (i) {
  2789. uniform float r = __stdlib_expf(extract(x_full, i));
  2790. ret = insert(ret, i, r);
  2791. }
  2792. return ret;
  2793. }
  2794. else if (__math_lib == __math_lib_ispc_fast) {
  2795. float z = floor(1.44269504088896341f * x_full + 0.5f);
  2796. int n;
  2797. x_full -= z * 0.693359375f;
  2798. x_full -= z * -2.12194440e-4f;
  2799. n = (int)z;
  2800. z = x_full * x_full;
  2801. z = (((((1.9875691500E-4f * x_full + 1.3981999507E-3f) * x_full +
  2802. 8.3334519073E-3f) * x_full + 4.1665795894E-2f) * x_full +
  2803. 1.6666665459E-1f) * x_full + 5.0000001201E-1f) * z + x_full + 1.f;
  2804. x_full = ldexp(z, n);
  2805. return x_full;
  2806. }
  2807. else if (__math_lib == __math_lib_ispc) {
  2808. const float ln2_part1 = 0.6931457519;
  2809. const float ln2_part2 = 1.4286067653e-6;
  2810. const float one_over_ln2 = 1.44269502162933349609375;
  2811. float scaled = x_full * one_over_ln2;
  2812. float k_real = floor(scaled);
  2813. int k = (int)k_real;
  2814. // Reduced range version of x
  2815. float x = x_full - k_real * ln2_part1;
  2816. x -= k_real * ln2_part2;
  2817. // These coefficients are for e^x in [0, ln(2)]
  2818. const float one = 1.;
  2819. const float c2 = 0.4999999105930328369140625;
  2820. const float c3 = 0.166668415069580078125;
  2821. const float c4 = 4.16539050638675689697265625e-2;
  2822. const float c5 = 8.378830738365650177001953125e-3;
  2823. const float c6 = 1.304379315115511417388916015625e-3;
  2824. const float c7 = 2.7555381529964506626129150390625e-4;
  2825. float result = x * c7 + c6;
  2826. result = x * result + c5;
  2827. result = x * result + c4;
  2828. result = x * result + c3;
  2829. result = x * result + c2;
  2830. result = x * result + one;
  2831. result = x * result + one;
  2832. // Compute 2^k (should differ for float and double, but I'll avoid
  2833. // it for now and just do floats)
  2834. const int fpbias = 127;
  2835. int biased_n = k + fpbias;
  2836. bool overflow = k > fpbias;
  2837. // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
  2838. // we've got underflow. -127 * ln(2) -> -88.02. So the most
  2839. // negative float input that doesn't result in zero is like -88.
  2840. bool underflow = (biased_n <= 0);
  2841. const int InfBits = 0x7f800000;
  2842. biased_n <<= 23;
  2843. // Reinterpret this thing as float
  2844. float two_to_the_n = floatbits(biased_n);
  2845. // Handle both doubles and floats (hopefully eliding the copy for float)
  2846. float elemtype_2n = two_to_the_n;
  2847. result *= elemtype_2n;
  2848. result = overflow ? floatbits(InfBits) : result;
  2849. result = underflow ? 0. : result;
  2850. return result;
  2851. }
  2852. }
  2853. __declspec(safe)
  2854. static inline uniform float exp(uniform float x_full) {
  2855. if (__have_native_transcendentals) {
  2856. return __exp_uniform_float(x_full);
  2857. }
  2858. else if (__math_lib == __math_lib_system ||
  2859. __math_lib == __math_lib_svml) {
  2860. return __stdlib_expf(x_full);
  2861. }
  2862. else if (__math_lib == __math_lib_ispc_fast) {
  2863. uniform float z = floor(1.44269504088896341f * x_full + 0.5f);
  2864. uniform int n;
  2865. x_full -= z * 0.693359375f;
  2866. x_full -= z * -2.12194440e-4f;
  2867. n = (int)z;
  2868. z = x_full * x_full;
  2869. z = (((((1.9875691500E-4f * x_full + 1.3981999507E-3f) * x_full +
  2870. 8.3334519073E-3f) * x_full + 4.1665795894E-2f) * x_full +
  2871. 1.6666665459E-1f) * x_full + 5.0000001201E-1f) * z + x_full + 1.f;
  2872. x_full = ldexp(z, n);
  2873. return x_full;
  2874. }
  2875. else if (__math_lib == __math_lib_ispc) {
  2876. const uniform float ln2_part1 = 0.6931457519;
  2877. const uniform float ln2_part2 = 1.4286067653e-6;
  2878. const uniform float one_over_ln2 = 1.44269502162933349609375;
  2879. uniform float scaled = x_full * one_over_ln2;
  2880. uniform float k_real = floor(scaled);
  2881. uniform int k = (uniform int)k_real;
  2882. // Reduced range version of x
  2883. uniform float x = x_full - k_real * ln2_part1;
  2884. x -= k_real * ln2_part2;
  2885. // These coefficients are for e^x in [0, ln(2)]
  2886. const uniform float one = 1.;
  2887. const uniform float c2 = 0.4999999105930328369140625;
  2888. const uniform float c3 = 0.166668415069580078125;
  2889. const uniform float c4 = 4.16539050638675689697265625e-2;
  2890. const uniform float c5 = 8.378830738365650177001953125e-3;
  2891. const uniform float c6 = 1.304379315115511417388916015625e-3;
  2892. const uniform float c7 = 2.7555381529964506626129150390625e-4;
  2893. uniform float result = x * c7 + c6;
  2894. result = x * result + c5;
  2895. result = x * result + c4;
  2896. result = x * result + c3;
  2897. result = x * result + c2;
  2898. result = x * result + one;
  2899. result = x * result + one;
  2900. // Compute 2^k (should differ for uniform float and double, but I'll avoid
  2901. // it for now and just do uniform floats)
  2902. const uniform int fpbias = 127;
  2903. uniform int biased_n = k + fpbias;
  2904. uniform bool overflow = k > fpbias;
  2905. // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
  2906. // we've got underflow. -127 * ln(2) -> -88.02. So the most
  2907. // negative uniform float input that doesn't result in zero is like -88.
  2908. uniform bool underflow = (biased_n <= 0);
  2909. const uniform int InfBits = 0x7f800000;
  2910. biased_n <<= 23;
2911. // Reinterpret this thing as a uniform float
  2912. uniform float two_to_the_n = floatbits(biased_n);
  2913. // Handle both doubles and uniform floats (hopefully eliding the copy for uniform float)
  2914. uniform float elemtype_2n = two_to_the_n;
  2915. result *= elemtype_2n;
  2916. result = overflow ? floatbits(InfBits) : result;
  2917. result = underflow ? 0. : result;
  2918. return result;
  2919. }
  2920. }
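// Illustration (not part of the stdlib): the 2^k reconstruction above works
// because a float equal to 2^k has a zero mantissa and a biased exponent of
// k + 127, so writing (k + 127) << 23 into the bit pattern gives exactly 2^k
// as long as k stays in the normalized range [-126, 127].
static inline uniform float example_two_to_the(uniform int k) {
    const uniform int fpbias = 127;
    uniform int bits = (k + fpbias) << 23;   // exponent field only, mantissa = 0
    return floatbits(bits);                  // e.g. k = 3 -> 8.0, k = -1 -> 0.5
}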
  2921. // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
  2922. // * log(2) + log(y) where y is the reduced range (usually in [1/2,
  2923. // 1)).
  2924. __declspec(safe)
  2925. static inline void __range_reduce_log(float input, varying float * uniform reduced,
  2926. varying int * uniform exponent) {
  2927. int int_version = intbits(input);
  2928. // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
  2929. // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
  2930. // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
  2931. // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
  2932. // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
  2933. //const int exponent_mask(0x7F800000)
  2934. static const int nonexponent_mask = 0x807FFFFF;
  2935. // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126
  2936. static const int exponent_neg1 = (126l << 23);
  2937. // NOTE(boulos): We don't need to mask anything out since we know
  2938. // the sign bit has to be 0. If it's 1, we need to return infinity/nan
  2939. // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
  2940. int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128]
  2941. int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
  2942. *exponent = offset_exponent - 127; // get the real value
  2943. // Blend the offset_exponent with the original input (do this in
  2944. // int for now, until I decide if float can have & and &not)
  2945. int blended = (int_version & nonexponent_mask) | (exponent_neg1);
  2946. *reduced = floatbits(blended);
  2947. }
  2948. __declspec(safe)
  2949. static inline void __range_reduce_log(uniform float input, uniform float * uniform reduced,
  2950. uniform int * uniform exponent) {
  2951. uniform int int_version = intbits(input);
  2952. static const uniform int nonexponent_mask = 0x807FFFFF;
  2953. static const uniform int exponent_neg1 = (126ul << 23);
  2954. uniform int biased_exponent = int_version >> 23;
  2955. uniform int offset_exponent = biased_exponent + 1;
  2956. *exponent = offset_exponent - 127; // get the real value
  2957. uniform int blended = (int_version & nonexponent_mask) | (exponent_neg1);
  2958. *reduced = floatbits(blended);
  2959. }
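// Sketch (not part of the stdlib) of what __range_reduce_log() produces: for a
// positive normalized input x it returns x == reduced * 2^exponent with
// reduced in [0.5, 1), which is the form the log() polynomial below expects.
static inline uniform float example_recompose_log_input(uniform float x) {
    uniform float reduced;
    uniform int exponent;
    __range_reduce_log(x, &reduced, &exponent);   // e.g. x = 6.0 -> reduced = 0.75, exponent = 3
    return ldexp(reduced, exponent);              // reproduces x for normalized inputs
}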
  2960. __declspec(safe)
  2961. static inline float log(float x_full) {
  2962. if (__have_native_transcendentals) {
  2963. return __log_varying_float(x_full);
  2964. }
  2965. else if (__math_lib == __math_lib_svml) {
  2966. return __svml_logf(x_full);
  2967. }
  2968. else if (__math_lib == __math_lib_system) {
  2969. float ret;
  2970. foreach_active (i) {
  2971. uniform float r = __stdlib_logf(extract(x_full, i));
  2972. ret = insert(ret, i, r);
  2973. }
  2974. return ret;
  2975. }
  2976. else if (__math_lib == __math_lib_ispc_fast) {
  2977. int e;
  2978. x_full = frexp(x_full, &e);
  2979. int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
  2980. e += x_smaller_SQRTHF;
  2981. int ix_add = intbits(x_full);
  2982. ix_add &= x_smaller_SQRTHF;
  2983. x_full += floatbits(ix_add) - 1.f;
  2984. float z = x_full * x_full;
  2985. float y =
  2986. ((((((((7.0376836292E-2f * x_full
  2987. + -1.1514610310E-1f) * x_full
  2988. + 1.1676998740E-1f) * x_full
  2989. + -1.2420140846E-1f) * x_full
  2990. + 1.4249322787E-1f) * x_full
  2991. + -1.6668057665E-1f) * x_full
  2992. + 2.0000714765E-1f) * x_full
  2993. + -2.4999993993E-1f) * x_full
  2994. + 3.3333331174E-1f) * x_full * z;
  2995. float fe = (float)e;
  2996. y += fe * -2.12194440e-4;
  2997. y -= 0.5f * z;
  2998. z = x_full + y;
  2999. return z + 0.693359375 * fe;
  3000. }
  3001. else if (__math_lib == __math_lib_ispc) {
  3002. float reduced;
  3003. int exponent;
  3004. const int NaN_bits = 0x7fc00000;
  3005. const int Neg_Inf_bits = 0xFF800000;
  3006. const float NaN = floatbits(NaN_bits);
  3007. const float neg_inf = floatbits(Neg_Inf_bits);
  3008. bool use_nan = x_full < 0.;
  3009. bool use_inf = x_full == 0.;
  3010. bool exceptional = use_nan || use_inf;
  3011. const float one = 1.0;
  3012. float patched = exceptional ? one : x_full;
  3013. __range_reduce_log(patched, &reduced, &exponent);
  3014. const float ln2 = 0.693147182464599609375;
  3015. float x1 = one - reduced;
  3016. const float c1 = 0.50000095367431640625;
  3017. const float c2 = 0.33326041698455810546875;
  3018. const float c3 = 0.2519190013408660888671875;
  3019. const float c4 = 0.17541764676570892333984375;
  3020. const float c5 = 0.3424419462680816650390625;
  3021. const float c6 = -0.599632322788238525390625;
  3022. const float c7 = +1.98442304134368896484375;
  3023. const float c8 = -2.4899270534515380859375;
  3024. const float c9 = +1.7491014003753662109375;
  3025. float result = x1 * c9 + c8;
  3026. result = x1 * result + c7;
  3027. result = x1 * result + c6;
  3028. result = x1 * result + c5;
  3029. result = x1 * result + c4;
  3030. result = x1 * result + c3;
  3031. result = x1 * result + c2;
  3032. result = x1 * result + c1;
  3033. result = x1 * result + one;
  3034. // Equation was for -(ln(red)/(1-red))
  3035. result *= -x1;
  3036. result += (float)(exponent) * ln2;
  3037. return exceptional ? (use_nan ? NaN : neg_inf) : result;
  3038. }
  3039. }
  3040. __declspec(safe)
  3041. static inline uniform float log(uniform float x_full) {
  3042. if (__have_native_transcendentals) {
  3043. return __log_uniform_float(x_full);
  3044. }
  3045. else if (__math_lib == __math_lib_system ||
  3046. __math_lib == __math_lib_svml) {
  3047. return __stdlib_logf(x_full);
  3048. }
  3049. else if (__math_lib == __math_lib_ispc_fast) {
  3050. uniform int e;
  3051. x_full = frexp(x_full, &e);
  3052. uniform int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
  3053. e += x_smaller_SQRTHF;
  3054. uniform int ix_add = intbits(x_full);
  3055. ix_add &= x_smaller_SQRTHF;
  3056. x_full += floatbits(ix_add) - 1.f;
  3057. uniform float z = x_full * x_full;
  3058. uniform float y =
  3059. ((((((((7.0376836292E-2f * x_full
  3060. + -1.1514610310E-1f) * x_full
  3061. + 1.1676998740E-1f) * x_full
  3062. + -1.2420140846E-1f) * x_full
  3063. + 1.4249322787E-1f) * x_full
  3064. + -1.6668057665E-1f) * x_full
  3065. + 2.0000714765E-1f) * x_full
  3066. + -2.4999993993E-1f) * x_full
  3067. + 3.3333331174E-1f) * x_full * z;
  3068. uniform float fe = (uniform float)e;
  3069. y += fe * -2.12194440e-4;
  3070. y -= 0.5f * z;
  3071. z = x_full + y;
  3072. return z + 0.693359375 * fe;
  3073. }
  3074. else if (__math_lib == __math_lib_ispc) {
  3075. uniform float reduced;
  3076. uniform int exponent;
  3077. const uniform int NaN_bits = 0x7fc00000;
  3078. const uniform int Neg_Inf_bits = 0xFF800000;
  3079. const uniform float NaN = floatbits(NaN_bits);
  3080. const uniform float neg_inf = floatbits(Neg_Inf_bits);
  3081. uniform bool use_nan = x_full < 0.;
  3082. uniform bool use_inf = x_full == 0.;
  3083. uniform bool exceptional = use_nan || use_inf;
  3084. const uniform float one = 1.0;
  3085. uniform float patched = exceptional ? one : x_full;
  3086. __range_reduce_log(patched, &reduced, &exponent);
  3087. const uniform float ln2 = 0.693147182464599609375;
  3088. uniform float x1 = one - reduced;
  3089. const uniform float c1 = 0.50000095367431640625;
  3090. const uniform float c2 = 0.33326041698455810546875;
  3091. const uniform float c3 = 0.2519190013408660888671875;
  3092. const uniform float c4 = 0.17541764676570892333984375;
  3093. const uniform float c5 = 0.3424419462680816650390625;
  3094. const uniform float c6 = -0.599632322788238525390625;
  3095. const uniform float c7 = +1.98442304134368896484375;
  3096. const uniform float c8 = -2.4899270534515380859375;
  3097. const uniform float c9 = +1.7491014003753662109375;
  3098. uniform float result = x1 * c9 + c8;
  3099. result = x1 * result + c7;
  3100. result = x1 * result + c6;
  3101. result = x1 * result + c5;
  3102. result = x1 * result + c4;
  3103. result = x1 * result + c3;
  3104. result = x1 * result + c2;
  3105. result = x1 * result + c1;
  3106. result = x1 * result + one;
  3107. // Equation was for -(ln(red)/(1-red))
  3108. result *= -x1;
  3109. result += (uniform float)(exponent) * ln2;
  3110. return exceptional ? (use_nan ? NaN : neg_inf) : result;
  3111. }
  3112. }
  3113. __declspec(safe)
  3114. static inline float pow(float a, float b) {
  3115. if (__have_native_transcendentals) {
  3116. return __pow_varying_float(a, b);
  3117. }
  3118. else if (__math_lib == __math_lib_svml) {
  3119. return __svml_powf(a, b);
  3120. }
  3121. else if (__math_lib == __math_lib_system) {
  3122. float ret;
  3123. foreach_active (i) {
  3124. uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
  3125. ret = insert(ret, i, r);
  3126. }
  3127. return ret;
  3128. }
  3129. else if (__math_lib == __math_lib_ispc ||
  3130. __math_lib == __math_lib_ispc_fast) {
  3131. return exp(b * log(a));
  3132. }
  3133. }
  3134. __declspec(safe)
  3135. static inline uniform float pow(uniform float a, uniform float b) {
  3136. if (__have_native_transcendentals) {
  3137. return __pow_uniform_float(a, b);
  3138. }
3139. else if (__math_lib == __math_lib_system ||
  3140. __math_lib == __math_lib_svml) {
  3141. return __stdlib_powf(a, b);
  3142. }
  3143. else if (__math_lib == __math_lib_ispc ||
  3144. __math_lib == __math_lib_ispc_fast) {
  3145. return exp(b * log(a));
  3146. }
  3147. }
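// Usage sketch (not part of the stdlib): the __math_lib_ispc path computes
// pow(a, b) as exp(b * log(a)), which is only meaningful for a > 0 (log()
// yields NaN for negative inputs).  A hypothetical gamma-correction helper
// therefore keeps the base strictly positive before calling pow():
static inline float example_gamma(float linear, uniform float g) {
    float base = max(linear, 1e-6f);   // keep log()'s argument positive
    return pow(base, g);
}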
  3148. ///////////////////////////////////////////////////////////////////////////
  3149. // Transcendentals (double precision)
  3150. __declspec(safe)
  3151. static inline double sqrt(double v) {
  3152. return __sqrt_varying_double(v);
  3153. }
  3154. __declspec(safe)
  3155. static inline uniform double sqrt(uniform double v) {
  3156. return __sqrt_uniform_double(v);
  3157. }
  3158. #define RSQRTD(QUAL) \
  3159. __declspec(safe) \
  3160. static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \
  3161. { \
  3162. QUAL double xh = x*0.5d; \
  3163. y += y*(0.5d0 - xh*y*y); \
  3164. y += y*(0.5d0 - xh*y*y); \
  3165. return y; \
  3166. } \
  3167. __declspec(safe) \
  3168. static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \
  3169. { \
  3170. if (x <= 1.0d+33 && x >= 1.0d-33) \
  3171. return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \
  3172. QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
  3173. QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \
  3174. QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \
  3175. QUAL double y = rsqrt((QUAL float)(x*exp)); \
  3176. return __rsqrt_iterate_##QUAL##_double(x, y*exph); \
  3177. }
  3178. RSQRTD(varying)
  3179. __declspec(safe)
  3180. static inline double rsqrt(double v) {
  3181. if (__have_native_rsqrtd)
  3182. return __rsqrt_varying_double(v);
  3183. else
  3184. return __rsqrt_safe_varying_double(v);
  3185. }
  3186. RSQRTD(uniform)
  3187. __declspec(safe)
  3188. static inline uniform double rsqrt(uniform double v) {
  3189. if (__have_native_rsqrtd)
  3190. return __rsqrt_uniform_double(v);
  3191. else
  3192. return __rsqrt_safe_uniform_double(v);
  3193. }
  3194. __declspec(safe)
  3195. static inline double ldexp(double x, int n) {
  3196. unsigned int64 ex = 0x7ff0000000000000;
  3197. unsigned int64 ix = intbits(x);
  3198. ex &= ix;
  3199. ix = ix & ~0x7ff0000000000000; // clear exponent
  3200. int64 n64 = ((int64)n << 52) + ex;
  3201. ix |= n64; // insert new exponent
  3202. return doublebits(ix);
  3203. }
  3204. __declspec(safe)
  3205. static inline uniform double ldexp(uniform double x, uniform int n) {
  3206. uniform unsigned int64 ex = 0x7ff0000000000000;
  3207. uniform unsigned int64 ix = intbits(x);
  3208. ex &= ix;
  3209. ix = ix & ~0x7ff0000000000000; // clear exponent
  3210. uniform int64 n64 = ((int64)n << 52) + ex;
  3211. ix |= n64; // insert new exponent
  3212. return doublebits(ix);
  3213. }
  3214. __declspec(safe)
  3215. static inline double frexp(double x, varying int * uniform pw2) {
  3216. unsigned int64 ex = 0x7ff0000000000000; // exponent mask
  3217. unsigned int64 ix = intbits(x);
  3218. ex &= ix;
  3219. ix &= ~0x7ff0000000000000; // clear exponent
  3220. *pw2 = (int)(ex >> 52) - 1022; // compute exponent
  3221. ix |= 0x3fe0000000000000; // insert exponent +1 in x
  3222. return doublebits(ix);
  3223. }
  3224. __declspec(safe)
  3225. static inline uniform double frexp(uniform double x, uniform int * uniform pw2) {
  3226. uniform unsigned int64 ex = 0x7ff0000000000000; // exponent mask
  3227. uniform unsigned int64 ix = intbits(x);
  3228. ex &= ix;
  3229. ix &= ~0x7ff0000000000000; // clear exponent
  3230. *pw2 = (int)(ex >> 52) - 1022; // compute exponent
  3231. ix |= 0x3fe0000000000000; // insert exponent +1 in x
  3232. return doublebits(ix);
  3233. }
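// Sketch (not part of the stdlib): frexp() and ldexp() above are inverse
// operations on the exponent field, so for normalized doubles splitting and
// recombining reproduces the input exactly:
static inline uniform double example_frexp_roundtrip(uniform double x) {
    uniform int e;
    uniform double m = frexp(x, &e);   // e.g. x = 48.0 -> m = 0.75, e = 6
    return ldexp(m, e);                // == x for normalized inputs
}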
  3234. __declspec(safe)
  3235. static inline double sin(double x) {
  3236. if (__have_native_trigonometry)
  3237. {
  3238. return __sin_varying_double(x);
  3239. }
  3240. else if (__math_lib == __math_lib_svml)
  3241. {
  3242. return __svml_sind(x);
  3243. }
  3244. else {
  3245. double ret;
  3246. foreach_active (i) {
  3247. uniform double r = __stdlib_sin(extract(x, i));
  3248. ret = insert(ret, i, r);
  3249. }
  3250. return ret;
  3251. }
  3252. }
  3253. __declspec(safe)
  3254. static inline uniform double asin(uniform double x) {
  3255. if (__have_native_trigonometry)
  3256. {
  3257. return __asin_uniform_double(x);
  3258. }
  3259. else
  3260. {
  3261. return __stdlib_asin(x);
  3262. }
  3263. }
  3264. __declspec(safe)
  3265. static inline uniform double sin(uniform double x) {
  3266. if (__have_native_trigonometry)
  3267. {
  3268. return __sin_uniform_double(x);
  3269. }
  3270. else
  3271. return __stdlib_sin(x);
  3272. }
  3273. __declspec(safe)
  3274. static inline double asin(const double x) {
  3275. if (__have_native_trigonometry)
  3276. {
  3277. return __asin_varying_double(x);
  3278. }
  3279. else if (__math_lib == __math_lib_svml)
  3280. {
  3281. return __svml_asind(x);
  3282. }
  3283. else {
  3284. double ret;
  3285. foreach_active (i) {
  3286. uniform double r = __stdlib_asin(extract(x, i));
  3287. ret = insert(ret, i, r);
  3288. }
  3289. return ret;
  3290. }
  3291. }
  3292. __declspec(safe)
  3293. static inline double cos(const double x) {
  3294. if (__have_native_trigonometry)
  3295. {
  3296. return __cos_varying_double(x);
  3297. }
  3298. if (__math_lib == __math_lib_svml)
  3299. {
  3300. return __svml_cosd(x);
  3301. }
  3302. else {
  3303. double ret;
  3304. foreach_active (i) {
  3305. uniform double r = __stdlib_cos(extract(x, i));
  3306. ret = insert(ret, i, r);
  3307. }
  3308. return ret;
  3309. }
  3310. }
  3311. __declspec(safe)
  3312. static inline uniform double cos(uniform double x) {
  3313. if (__have_native_trigonometry)
  3314. {
  3315. return __cos_uniform_double(x);
  3316. }
  3317. else
  3318. return __stdlib_cos(x);
  3319. }
  3320. __declspec(safe)
  3321. static inline void sincos(double x, varying double * uniform sin_result,
  3322. varying double * uniform cos_result) {
  3323. if (__have_native_trigonometry)
  3324. {
  3325. __sincos_varying_double(x,sin_result,cos_result);
  3326. }
3327. else if (__math_lib == __math_lib_svml)
  3328. {
  3329. __svml_sincosd(x, sin_result, cos_result);
  3330. }
  3331. else {
  3332. foreach_active (i) {
  3333. uniform double sr, cr;
  3334. __stdlib_sincos(extract(x, i), &sr, &cr);
  3335. *sin_result = insert(*sin_result, i, sr);
  3336. *cos_result = insert(*cos_result, i, cr);
  3337. }
  3338. }
  3339. }
  3340. __declspec(safe)
  3341. static inline void sincos(uniform double x, uniform double * uniform sin_result,
  3342. uniform double * uniform cos_result) {
  3343. if (__have_native_trigonometry)
  3344. {
  3345. __sincos_uniform_double(x,sin_result, cos_result);
  3346. }
  3347. else
  3348. __stdlib_sincos(x, sin_result, cos_result);
  3349. }
  3350. __declspec(safe)
  3351. static inline double tan(double x) {
  3352. if (__have_native_trigonometry)
  3353. {
  3354. return __tan_varying_double(x);
  3355. }
  3356. else if (__math_lib == __math_lib_svml)
  3357. {
  3358. return __svml_tand(x);
  3359. }
  3360. else {
  3361. double ret;
  3362. foreach_active (i) {
  3363. uniform double r = __stdlib_tan(extract(x, i));
  3364. ret = insert(ret, i, r);
  3365. }
  3366. return ret;
  3367. }
  3368. }
  3369. __declspec(safe)
  3370. static inline uniform double tan(uniform double x) {
  3371. if (__have_native_trigonometry)
  3372. {
  3373. return __tan_uniform_double(x);
  3374. }
  3375. else
  3376. return __stdlib_tan(x);
  3377. }
  3378. __declspec(safe)
  3379. static inline double atan(double x) {
  3380. if (__have_native_trigonometry)
  3381. {
  3382. return __atan_varying_double(x);
  3383. }
  3384. else {
  3385. double ret;
  3386. foreach_active (i) {
  3387. uniform double r = __stdlib_atan(extract(x, i));
  3388. ret = insert(ret, i, r);
  3389. }
  3390. return ret;
  3391. }
  3392. }
  3393. __declspec(safe)
  3394. static inline uniform double atan(uniform double x) {
  3395. if (__have_native_trigonometry)
  3396. {
  3397. return __atan_uniform_double(x);
  3398. }
  3399. else
  3400. return __stdlib_atan(x);
  3401. }
  3402. __declspec(safe)
  3403. static inline double atan2(double y, double x) {
  3404. if (__have_native_trigonometry)
  3405. {
  3406. return __atan2_varying_double(y,x);
  3407. }
  3408. else if (__math_lib == __math_lib_svml)
  3409. {
  3410. return __svml_atan2d(y,x);
  3411. }
  3412. else {
  3413. double ret;
  3414. foreach_active (i) {
  3415. uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
  3416. ret = insert(ret, i, r);
  3417. }
  3418. return ret;
  3419. }
  3420. }
  3421. __declspec(safe)
  3422. static inline uniform double atan2(uniform double y, uniform double x) {
  3423. if (__have_native_trigonometry)
  3424. {
  3425. return __atan2_uniform_double(y,x);
  3426. }
  3427. else
  3428. return __stdlib_atan2(y, x);
  3429. }
  3430. __declspec(safe)
  3431. static inline double exp(double x) {
  3432. if (__have_native_transcendentals) {
  3433. return __exp_varying_double(x);
  3434. }
  3435. else if (__math_lib == __math_lib_svml)
  3436. {
  3437. return __svml_expd(x);
  3438. }
  3439. else {
  3440. double ret;
  3441. foreach_active (i) {
  3442. uniform double r = __stdlib_exp(extract(x, i));
  3443. ret = insert(ret, i, r);
  3444. }
  3445. return ret;
  3446. }
  3447. }
  3448. __declspec(safe)
  3449. static inline uniform double exp(uniform double x) {
  3450. if (__have_native_transcendentals) {
  3451. return __exp_uniform_double(x);
  3452. }
  3453. else
  3454. return __stdlib_exp(x);
  3455. }
  3456. __declspec(safe)
  3457. static inline double log(double x) {
  3458. if (__have_native_transcendentals) {
  3459. return __log_varying_double(x);
  3460. }
  3461. else if (__math_lib == __math_lib_svml)
  3462. {
  3463. return __svml_logd(x);
  3464. }
  3465. else {
  3466. double ret;
  3467. foreach_active (i) {
  3468. uniform double r = __stdlib_log(extract(x, i));
  3469. ret = insert(ret, i, r);
  3470. }
  3471. return ret;
  3472. }
  3473. }
  3474. __declspec(safe)
  3475. static inline uniform double log(uniform double x) {
  3476. if (__have_native_transcendentals) {
  3477. return __log_uniform_double(x);
  3478. }
  3479. else
  3480. return __stdlib_log(x);
  3481. }
  3482. __declspec(safe)
  3483. static inline double pow(double a, double b) {
  3484. if (__have_native_transcendentals) {
  3485. return __pow_varying_double(a,b);
  3486. }
  3487. else if (__math_lib == __math_lib_svml)
  3488. {
  3489. return __svml_powd(a,b);
  3490. }
  3491. else {
  3492. double ret;
  3493. foreach_active (i) {
  3494. uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
  3495. ret = insert(ret, i, r);
  3496. }
  3497. return ret;
  3498. }
  3499. }
  3500. __declspec(safe)
  3501. static inline uniform double pow(uniform double a, uniform double b) {
  3502. if (__have_native_transcendentals) {
  3503. return __pow_uniform_double(a,b);
  3504. }
  3505. else
  3506. return __stdlib_pow(a, b);
  3507. }
  3508. ///////////////////////////////////////////////////////////////////////////
  3509. // half-precision floats
  3510. __declspec(safe)
  3511. static inline uniform float half_to_float(uniform unsigned int16 h) {
  3512. if (__have_native_half) {
  3513. return __half_to_float_uniform(h);
  3514. }
  3515. else {
  3516. // https://gist.github.com/2144712
  3517. // Fabian "ryg" Giesen.
  3518. static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift
  3519. uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits
  3520. uniform unsigned int32 exp = shifted_exp & o; // just the exponent
  3521. o += (uniform int32)(127 - 15) << 23; // exponent adjust
  3522. // handle exponent special cases
  3523. if (exp == shifted_exp) // Inf/NaN?
  3524. o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust
  3525. else if (exp == 0) { // Zero/Denormal?
  3526. o += 1ul << 23; // extra exp adjust
  3527. o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize
  3528. }
  3529. o |= ((int32)(h & 0x8000)) << 16; // sign bit
  3530. return floatbits(o);
  3531. }
  3532. }
  3533. __declspec(safe)
  3534. static inline float half_to_float(unsigned int16 h) {
  3535. if (__have_native_half) {
  3536. return __half_to_float_varying((unsigned int16)h);
  3537. }
  3538. else {
  3539. // https://gist.github.com/2144712
  3540. // Fabian "ryg" Giesen.
  3541. const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift
  3542. int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits
  3543. unsigned int32 exp = shifted_exp & o; // just the exponent
  3544. o += (int32)(127 - 15) << 23; // exponent adjust
  3545. int32 infnan_val = o + ((int32)(128 - 16) << 23);
  3546. int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23));
  3547. int32 reg_val = (exp == 0) ? zerodenorm_val : o;
  3548. int32 sign_bit = ((int32)(h & 0x8000ul)) << 16;
  3549. return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit);
  3550. }
  3551. }
  3552. __declspec(safe)
  3553. static inline uniform int16 float_to_half(uniform float f) {
  3554. if (__have_native_half) {
  3555. return __float_to_half_uniform(f);
  3556. }
  3557. else {
  3558. // via Fabian "ryg" Giesen.
  3559. // https://gist.github.com/2156668
  3560. uniform unsigned int32 sign_mask = 0x80000000u;
  3561. uniform int32 o;
  3562. uniform int32 fint = intbits(f);
  3563. uniform int32 sign = fint & sign_mask;
  3564. fint ^= sign;
  3565. // NOTE all the integer compares in this function can be safely
  3566. // compiled into signed compares since all operands are below
  3567. // 0x80000000. Important if you want fast straight SSE2 code (since
  3568. // there's no unsigned PCMPGTD).
  3569. // Inf or NaN (all exponent bits set)
  3570. // NaN->qNaN and Inf->Inf
  3571. // unconditional assignment here, will override with right value for
  3572. // the regular case below.
  3573. uniform int32 f32infty = 255ul << 23;
  3574. o = (fint > f32infty) ? 0x7e00u : 0x7c00u;
  3575. // (De)normalized number or zero
  3576. // update fint unconditionally to save the blending; we don't need it
  3577. // anymore for the Inf/NaN case anyway.
  3578. const uniform unsigned int32 round_mask = ~0xffful;
  3579. const uniform int32 magic = 15ul << 23;
  3580. const uniform int32 f16infty = 31ul << 23;
  3581. uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask;
  3582. fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
  3583. if (fint < f32infty)
  3584. o = fint2 >> 13; // Take the bits!
  3585. return (o | (sign >> 16));
  3586. }
  3587. }
  3588. __declspec(safe)
  3589. static inline int16 float_to_half(float f) {
  3590. if (__have_native_half) {
  3591. return __float_to_half_varying(f);
  3592. }
  3593. else {
  3594. // via Fabian "ryg" Giesen.
  3595. // https://gist.github.com/2156668
  3596. unsigned int32 sign_mask = 0x80000000u;
  3597. int32 o;
  3598. int32 fint = intbits(f);
  3599. int32 sign = fint & sign_mask;
  3600. fint ^= sign;
  3601. // NOTE all the integer compares in this function can be safely
  3602. // compiled into signed compares since all operands are below
  3603. // 0x80000000. Important if you want fast straight SSE2 code (since
  3604. // there's no unsigned PCMPGTD).
  3605. // Inf or NaN (all exponent bits set)
  3606. // NaN->qNaN and Inf->Inf
  3607. // unconditional assignment here, will override with right value for
  3608. // the regular case below.
  3609. int32 f32infty = 255ul << 23;
  3610. o = (fint > f32infty) ? 0x7e00u : 0x7c00u;
  3611. // (De)normalized number or zero
  3612. // update fint unconditionally to save the blending; we don't need it
  3613. // anymore for the Inf/NaN case anyway.
  3614. const unsigned int32 round_mask = ~0xffful;
  3615. const int32 magic = 15ul << 23;
  3616. const int32 f16infty = 31ul << 23;
  3617. // Shift exponent down, denormalize if necessary.
  3618. // NOTE This represents half-float denormals using single precision denormals.
  3619. // The main reason to do this is that there's no shift with per-lane variable
  3620. // shifts in SSE*, which we'd otherwise need. It has some funky side effects
  3621. // though:
  3622. // - This conversion will actually respect the FTZ (Flush To Zero) flag in
  3623. // MXCSR - if it's set, no half-float denormals will be generated. I'm
  3624. // honestly not sure whether this is good or bad. It's definitely interesting.
  3625. // - If the underlying HW doesn't support denormals (not an issue with Intel
  3626. // CPUs, but might be a problem on GPUs or PS3 SPUs), you will always get
  3627. // flush-to-zero behavior. This is bad, unless you're on a CPU where you don't
  3628. // care.
  3629. // - Denormals tend to be slow. FP32 denormals are rare in practice outside of things
  3630. // like recursive filters in DSP - not a typical half-float application. Whether
  3631. // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW
  3632. // may or may not have for denormals, this may well hit it.
  3633. float fscale = floatbits(fint & round_mask) * floatbits(magic);
  3634. fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul));
  3635. int32 fint2 = intbits(fscale) - round_mask;
  3636. if (fint < f32infty)
  3637. o = fint2 >> 13; // Take the bits!
  3638. return (o | (sign >> 16));
  3639. }
  3640. }
  3641. __declspec(safe)
  3642. static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
  3643. if (__have_native_half) {
  3644. return __half_to_float_uniform(h);
  3645. }
  3646. else {
  3647. uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
  3648. uniform unsigned int32 hem = h & (int32)0x7fffu; // Pick off exponent-mantissa bits
  3649. uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
  3650. uniform unsigned int32 xem = ((unsigned int32) hem) << 13;
  3651. xem += 0x38000000; // (127 - 15) << 23
  3652. return floatbits(xs | xem);
  3653. }
  3654. }
  3655. __declspec(safe)
  3656. static inline float half_to_float_fast(unsigned int16 h) {
  3657. if (__have_native_half) {
  3658. return __half_to_float_varying(h);
  3659. }
  3660. else {
  3661. unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
  3662. unsigned int32 hem = h & (int32)0x7fffu; // Pick off exponent-mantissa bits
  3663. unsigned int32 xs = ((unsigned int32) hs) << 16;
  3664. unsigned int32 xem = ((unsigned int32) hem) << 13;
  3665. return floatbits(xs | (xem + 0x38000000 /* (127 - 15) << 23 */));
  3666. }
  3667. }
  3668. __declspec(safe)
  3669. static inline uniform int16 float_to_half_fast(uniform float f) {
  3670. if (__have_native_half) {
  3671. return __float_to_half_uniform(f);
  3672. }
  3673. else {
  3674. uniform int32 x = intbits(f);
  3675. uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
  3676. uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
  3677. uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
  3678. uniform unsigned int32 hs = (xs >> 16); // Sign bit
  3679. // Exponent unbias the single, then bias the halfp
  3680. uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
  3681. uniform unsigned int32 he = (hes << 10); // Exponent
  3682. uniform int32 hm = (xm >> 13); // Mantissa
  3683. uniform int32 ret = (hs | he | hm);
  3684. if (xm & 0x00001000u) // Check for rounding
  3685. // Round, might overflow to inf, this is OK
  3686. ret += 1u;
  3687. return (int16)ret;
  3688. }
  3689. }
  3690. __declspec(safe)
  3691. static inline int16 float_to_half_fast(float f) {
  3692. if (__have_native_half) {
  3693. return __float_to_half_varying(f);
  3694. }
  3695. else {
  3696. int32 x = intbits(f);
  3697. unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
  3698. unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
  3699. unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
  3700. unsigned int32 hs = (xs >> 16); // Sign bit
  3701. // Exponent unbias the single, then bias the halfp
  3702. int32 hes = ((int)(xe >> 23)) - 127 + 15;
  3703. unsigned int32 he = (hes << 10); // Exponent
  3704. int32 hm = (xm >> 13); // Mantissa
  3705. int32 ret = (hs | he | hm);
  3706. if (xm & 0x00001000u) // Check for rounding
  3707. // Round, might overflow to inf, this is OK
  3708. ret += 1u;
  3709. return (int16)ret;
  3710. }
  3711. }
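// Usage sketch (not part of the stdlib): a hypothetical conversion loop that
// stores a float buffer as 16-bit halves.  float_to_half()/half_to_float()
// include the Inf/NaN/denormal handling discussed above; the *_fast variants
// assume finite, normalized values and skip it.
static inline void example_pack_halves(uniform float src[], uniform int16 dst[],
                                       uniform int count) {
    foreach (i = 0 ... count) {
        dst[i] = float_to_half(src[i]);
    }
}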
  3712. ///////////////////////////////////////////////////////////////////////////
  3713. // float -> srgb8
  3714. // https://gist.github.com/2246678, from Fabian "rygorous" Giesen.
  3715. //
  3716. // The basic ideas are still the same, only this time, we squeeze
  3717. // everything into the table, even the linear part of the range; since we
  3718. // are approximating the function as piecewise linear anyway, this is
  3719. // fairly easy.
  3720. //
  3721. // In the exact version of the conversion, any value that produces an
  3722. // output float less than 0.5 will be rounded to an integer of
  3723. // zero. Inverting the linear part of the transform, we get:
  3724. //
  3725. // log2(0.5 / (255 * 12.92)) =~ -12.686
  3726. //
  3727. // which in turn means that any value smaller than about 2^(-12.687) will
  3728. // return 0. What this means is that we can adapt the clamping code to
  3729. // just clamp to [2^(-13), 1-eps] and we're covered. This means our table
  3730. // needs to cover a range of 13 different exponents from -13 to -1.
  3731. //
3732. // The table lookup, storage, and interpolation work exactly the same way
  3733. // as in the code above.
  3734. //
  3735. // Max error for the whole function (integer-rounded result minus "exact"
  3736. // value, as computed in floats using the official formula): 0.544403 at
  3737. // 0x3e9f8000
  3738. __declspec(safe)
  3739. static inline int
  3740. float_to_srgb8(float inval)
  3741. {
  3742. static const uniform unsigned int table[104] = {
  3743. 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
  3744. 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
  3745. 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
  3746. 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
  3747. 0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
  3748. 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
  3749. 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
  3750. 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
  3751. 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
  3752. 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
  3753. 0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
  3754. 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
  3755. 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
  3756. 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
  3757. 0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
  3758. 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
  3759. 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
  3760. 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
  3761. 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
  3762. 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
  3763. 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
  3764. 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
  3765. 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
  3766. 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
  3767. 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
  3768. 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
  3769. };
  3770. static const uniform unsigned int almost_one = 0x3f7fffff;
3771. // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively, and keep the table index below in range.
3772. inval = max(inval, floatbits(0x39000000u)); // 2^(-13); clamping only to 0.0f would let intbits(inval) - 0x39000000u underflow
  3773. inval = min(inval, floatbits(almost_one));
  3774. // Do the table lookup and unpack bias, scale
  3775. unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20];
  3776. unsigned int bias = (tab >> 16) << 9;
  3777. unsigned int scale = tab & 0xfffful;
  3778. // Grab next-highest mantissa bits and perform linear interpolation
  3779. unsigned int t = (intbits(inval) >> 12) & 0xff;
  3780. return (bias + scale*t) >> 16;
  3781. }
  3782. __declspec(safe)
  3783. static inline uniform int
  3784. float_to_srgb8(uniform float inval)
  3785. {
  3786. static const uniform unsigned int table[104] = {
  3787. 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
  3788. 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
  3789. 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
  3790. 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
  3791. 0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
  3792. 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
  3793. 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
  3794. 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
  3795. 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
  3796. 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
  3797. 0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
  3798. 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
  3799. 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
  3800. 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
  3801. 0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
  3802. 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
  3803. 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
  3804. 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
  3805. 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
  3806. 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
  3807. 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
  3808. 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
  3809. 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
  3810. 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
  3811. 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
  3812. 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
  3813. };
  3814. static const uniform unsigned int almost_one = 0x3f7fffff;
3815. // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively, and keep the table index below in range.
3816. inval = max(inval, floatbits(0x39000000u)); // 2^(-13); clamping only to 0.0f would let intbits(inval) - 0x39000000u underflow
  3817. inval = min(inval, floatbits(almost_one));
  3818. // Do the table lookup and unpack bias, scale
  3819. uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20];
  3820. uniform unsigned int bias = (tab >> 16) << 9;
  3821. uniform unsigned int scale = tab & 0xfffful;
  3822. // Grab next-highest mantissa bits and perform linear interpolation
  3823. uniform unsigned int t = (intbits(inval) >> 12) & 0xff;
  3824. return (bias + scale*t) >> 16;
  3825. }
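// Usage sketch (not part of the stdlib): converting a linear-light float image
// to 8-bit sRGB with the table-based routine above; inputs are expected in
// [0, 1] and out-of-range values are clamped inside float_to_srgb8().
static inline void example_linear_to_srgb8(uniform float linear[],
                                           uniform unsigned int8 pixels[],
                                           uniform int count) {
    foreach (i = 0 ... count) {
        pixels[i] = (unsigned int8)float_to_srgb8(linear[i]);
    }
}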
  3826. ///////////////////////////////////////////////////////////////////////////
  3827. // RNG stuff
  3828. struct RNGState {
  3829. unsigned int z1, z2, z3, z4;
  3830. };
  3831. static inline unsigned int random(varying RNGState * uniform state)
  3832. {
  3833. unsigned int b;
  3834. b = ((state->z1 << 6) ^ state->z1) >> 13;
  3835. state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
  3836. b = ((state->z2 << 2) ^ state->z2) >> 27;
  3837. state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
  3838. b = ((state->z3 << 13) ^ state->z3) >> 21;
  3839. state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
  3840. b = ((state->z4 << 3) ^ state->z4) >> 12;
  3841. state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
  3842. return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
  3843. }
  3844. static inline uniform unsigned int random(uniform RNGState * uniform state)
  3845. {
  3846. uniform unsigned int b;
  3847. b = ((state->z1 << 6) ^ state->z1) >> 13;
  3848. state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
  3849. b = ((state->z2 << 2) ^ state->z2) >> 27;
  3850. state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
  3851. b = ((state->z3 << 13) ^ state->z3) >> 21;
  3852. state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
  3853. b = ((state->z4 << 3) ^ state->z4) >> 12;
  3854. state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
  3855. return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
  3856. }
  3857. static inline float frandom(varying RNGState * uniform state)
  3858. {
  3859. unsigned int irand = random(state);
  3860. irand &= (1ul<<23)-1;
  3861. return floatbits(0x3F800000 | irand)-1.0f;
  3862. }
  3863. static inline uniform float frandom(uniform RNGState * uniform state)
  3864. {
  3865. uniform unsigned int irand = random(state);
  3866. irand &= (1ul<<23)-1;
  3867. return floatbits(0x3F800000 | irand)-1.0f;
  3868. }
  3869. static inline void seed_rng(varying RNGState * uniform state,
  3870. unsigned int seed) {
  3871. state->z1 = seed;
  3872. state->z2 = seed ^ 0xbeeff00d;
  3873. state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
  3874. state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
  3875. ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
  3876. }
  3877. static inline void seed_rng(uniform RNGState * uniform state,
  3878. uniform unsigned int seed) {
  3879. state->z1 = seed;
  3880. state->z2 = seed ^ 0xbeeff00d;
  3881. state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
  3882. state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
  3883. ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
  3884. }
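// Usage sketch (not part of the stdlib): RNGState above is a small
// Tausworthe-style generator; seed it once per program instance, then draw
// values with random() / frandom().  Hypothetical example averaging n draws:
static inline float example_average_of_draws(uniform unsigned int seed, uniform int n) {
    RNGState state;
    seed_rng(&state, 1 + seed + programIndex);  // give each program instance its own nonzero seed
    float sum = 0;
    for (uniform int i = 0; i < n; ++i)
        sum += frandom(&state);                 // uniformly distributed in [0, 1)
    return sum / n;
}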
  3885. static inline void fastmath() {
  3886. __fastmath();
  3887. }
  3888. ///////////////////////////////////////////////////////////////////////////
  3889. // saturation arithmetic
  3890. static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
  3891. uniform unsigned int8 a_unsig = a, b_unsig = b;
  3892. uniform unsigned int8 result = a_unsig + b_unsig;
  3893. a_unsig = (a_unsig >> 7) + INT8_MAX;
  3894. if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
  3895. result = a_unsig;
  3896. return result;
  3897. }
  3898. static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
  3899. return __padds_vi8(a, b);
  3900. }
  3901. static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
  3902. uniform unsigned int16 a_unsig = a, b_unsig = b;
  3903. uniform unsigned int16 result = a_unsig + b_unsig;
  3904. a_unsig = (a_unsig >> 15) + INT16_MAX;
  3905. if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
  3906. result = a_unsig;
  3907. return result;
  3908. }
  3909. static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
  3910. return __padds_vi16(a, b);
  3911. }
  3912. static inline uniform int32 saturating_add(uniform int32 a, uniform int32 b) {
  3913. uniform unsigned int32 a_unsig = a, b_unsig = b;
  3914. uniform unsigned int32 result = a_unsig + b_unsig;
  3915. a_unsig = (a_unsig >> 31) + INT32_MAX;
  3916. if ((uniform int32) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
  3917. result = a_unsig;
  3918. return result;
  3919. }
  3920. static inline varying int32 saturating_add(varying int32 a, varying int32 b) {
  3921. varying unsigned int32 a_unsig = a, b_unsig = b;
  3922. varying unsigned int32 result = a_unsig + b_unsig;
  3923. a_unsig = (a_unsig >> 31) + INT32_MAX;
  3924. if ((varying int32) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
  3925. result = a_unsig;
  3926. return result;
  3927. }
  3928. static inline uniform int64 saturating_add(uniform int64 a, uniform int64 b) {
  3929. uniform unsigned int64 a_unsig = a, b_unsig = b;
  3930. uniform unsigned int64 result = a_unsig + b_unsig;
  3931. a_unsig = (a_unsig >> 63) + INT64_MAX;
  3932. if ((uniform int64) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
  3933. result = a_unsig;
  3934. return result;
  3935. }
  3936. static inline varying int64 saturating_add(varying int64 a, varying int64 b) {
  3937. varying unsigned int64 a_unsig = a, b_unsig = b;
  3938. varying unsigned int64 result = a_unsig + b_unsig;
  3939. a_unsig = (a_unsig >> 63) + INT64_MAX;
  3940. if ((varying int64) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
  3941. result = a_unsig;
  3942. return result;
  3943. }
  3944. static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a,
  3945. uniform unsigned int8 b) {
  3946. uniform unsigned int8 result = a + b;
  3947. result |= (-(uniform int8)(result < a));
  3948. return result;
  3949. }
  3950. static inline varying unsigned int8 saturating_add(varying unsigned int8 a,
  3951. varying unsigned int8 b) {
  3952. return __paddus_vi8(a, b);
  3953. }
  3954. static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a,
  3955. uniform unsigned int16 b) {
  3956. uniform unsigned int16 result = a + b;
  3957. result |= (-(uniform int16)(result < a));
  3958. return result;
  3959. }
  3960. static inline varying unsigned int16 saturating_add(varying unsigned int16 a,
  3961. varying unsigned int16 b) {
  3962. return __paddus_vi16(a, b);
  3963. }
  3964. static inline uniform unsigned int32 saturating_add(uniform unsigned int32 a,
  3965. uniform unsigned int32 b) {
  3966. uniform unsigned int32 result = a + b;
  3967. result |= (-(uniform int32)(result < a));
  3968. return result;
  3969. }
  3970. static inline varying unsigned int32 saturating_add(varying unsigned int32 a,
  3971. varying unsigned int32 b) {
  3972. varying unsigned int32 result = a + b;
  3973. result |= (-(varying int32)(result < a));
  3974. return result;
  3975. }
  3976. static inline uniform unsigned int64 saturating_add(uniform unsigned int64 a,
  3977. uniform unsigned int64 b) {
  3978. uniform unsigned int64 result = a + b;
  3979. result |= (-(uniform int64)(result < a));
  3980. return result;
  3981. }
  3982. static inline varying unsigned int64 saturating_add(varying unsigned int64 a,
  3983. varying unsigned int64 b) {
  3984. varying unsigned int64 result = a + b;
  3985. result |= (-(varying int64)(result < a));
  3986. return result;
  3987. }
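// Usage sketch (not part of the stdlib): saturating arithmetic clamps to the
// type's range instead of wrapping, which is usually what 8-bit image math
// wants.  Hypothetical example brightening a uint8 image:
static inline void example_brighten(uniform unsigned int8 pixels[], uniform int count,
                                    uniform unsigned int8 amount) {
    foreach (i = 0 ... count) {
        unsigned int8 p = pixels[i];
        unsigned int8 add = amount;              // broadcast to varying
        pixels[i] = saturating_add(p, add);      // e.g. 200 + 100 -> 255, not 44
    }
}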
  3988. static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
  3989. uniform unsigned int8 a_unsig = a, b_unsig = b;
  3990. uniform unsigned int8 result = a_unsig - b_unsig;
  3991. a_unsig = (a_unsig >> 7) + INT8_MAX;
  3992. if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
  3993. result = a_unsig;
  3994. return result;
  3995. }
  3996. static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
  3997. return __psubs_vi8(a, b);
  3998. }
  3999. static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
  4000. uniform unsigned int16 a_unsig = a, b_unsig = b;
  4001. uniform unsigned int16 result = a_unsig - b_unsig;
  4002. a_unsig = (a_unsig >> 15) + INT16_MAX;
  4003. if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
  4004. result = a_unsig;
  4005. return result;
  4006. }
  4007. static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
  4008. return __psubs_vi16(a, b);
  4009. }
  4010. static inline uniform int32 saturating_sub(uniform int32 a, uniform int32 b) {
  4011. uniform unsigned int32 a_unsig = a, b_unsig = b;
  4012. uniform unsigned int32 result = a_unsig - b_unsig;
  4013. a_unsig = (a_unsig >> 31) + INT32_MAX;
  4014. if ((uniform int32) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
  4015. result = a_unsig;
  4016. return result;
  4017. }
  4018. static inline varying int32 saturating_sub(varying int32 a, varying int32 b) {
  4019. varying unsigned int32 a_unsig = a, b_unsig = b;
  4020. varying unsigned int32 result = a_unsig - b_unsig;
  4021. a_unsig = (a_unsig >> 31) + INT32_MAX;
  4022. if ((varying int32) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
  4023. result = a_unsig;
  4024. return result;
  4025. }
  4026. static inline uniform int64 saturating_sub(uniform int64 a, uniform int64 b) {
  4027. uniform unsigned int64 a_unsig = a, b_unsig = b;
  4028. uniform unsigned int64 result = a_unsig - b_unsig;
  4029. a_unsig = (a_unsig >> 63) + INT64_MAX;
  4030. if ((uniform int64) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
  4031. result = a_unsig;
  4032. return result;
  4033. }
  4034. static inline varying int64 saturating_sub(varying int64 a, varying int64 b) {
  4035. varying unsigned int64 a_unsig = a, b_unsig = b;
  4036. varying unsigned int64 result = a_unsig - b_unsig;
  4037. a_unsig = (a_unsig >> 63) + INT64_MAX;
  4038. if ((varying int64) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
  4039. result = a_unsig;
  4040. return result;
  4041. }
  4042. static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a,
  4043. uniform unsigned int8 b) {
  4044. uniform unsigned int8 result = a - b;
  4045. result &= (-(uniform int8)(result <= a));
  4046. return result;
  4047. }
  4048. static inline varying unsigned int8 saturating_sub(varying unsigned int8 a,
  4049. varying unsigned int8 b) {
  4050. return __psubus_vi8(a, b);
  4051. }
  4052. static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a,
  4053. uniform unsigned int16 b) {
  4054. uniform unsigned int16 result = a - b;
  4055. result &= (-(uniform int16)(result <= a));
  4056. return result;
  4057. }
  4058. static inline varying unsigned int16 saturating_sub(varying unsigned int16 a,
  4059. varying unsigned int16 b) {
  4060. return __psubus_vi16(a, b);
  4061. }
  4062. static inline uniform unsigned int32 saturating_sub(uniform unsigned int32 a,
  4063. uniform unsigned int32 b) {
  4064. uniform unsigned int32 result = a - b;
  4065. result &= (-(uniform int32)(result <= a));
  4066. return result;
  4067. }
  4068. static inline varying unsigned int32 saturating_sub(varying unsigned int32 a,
  4069. varying unsigned int32 b) {
  4070. varying unsigned int32 result = a - b;
  4071. result &= (-(varying int32)(result <= a));
  4072. return result;
  4073. }
  4074. static inline uniform unsigned int64 saturating_sub(uniform unsigned int64 a,
  4075. uniform unsigned int64 b) {
  4076. uniform unsigned int64 result = a - b;
  4077. result &= (-(uniform int64)(result <= a));
  4078. return result;
  4079. }
  4080. static inline varying unsigned int64 saturating_sub(varying unsigned int64 a,
  4081. varying unsigned int64 b) {
  4082. varying unsigned int64 result = a - b;
  4083. result &= (-(varying int64)(result <= a));
  4084. return result;
  4085. }
  4086. static inline uniform int8 saturating_div(uniform int8 a, uniform int8 b) {
  4087. /* Only one way to overflow, so test for and prevent it. */
  4088. a += !((b + 1) | ((uniform unsigned int8) a + INT8_MIN));
  4089. return a / b;
  4090. }
  4091. static inline varying int8 saturating_div(varying int8 a, varying int8 b) {
  4092. /* Only one way to overflow, so test for and prevent it. */
  4093. a += !((b + 1) | ((varying unsigned int8) a + INT8_MIN));
  4094. return a / b;
  4095. }
  4096. static inline uniform int16 saturating_div(uniform int16 a, uniform int16 b) {
  4097. /* Only one way to overflow, so test for and prevent it. */
  4098. a += !((b + 1) | ((uniform unsigned int16) a + INT16_MIN));
  4099. return a / b;
  4100. }
  4101. static inline varying int16 saturating_div(varying int16 a, varying int16 b) {
  4102. /* Only one way to overflow, so test for and prevent it. */
  4103. a += !((b + 1) | ((varying unsigned int16) a + INT16_MIN));
  4104. return a / b;
  4105. }
  4106. static inline uniform int32 saturating_div(uniform int32 a, uniform int32 b) {
  4107. /* Only one way to overflow, so test for and prevent it. */
  4108. a += !((b + 1) | ((uniform unsigned int32) a + INT32_MIN));
  4109. return a / b;
  4110. }
  4111. static inline varying int32 saturating_div(varying int32 a, varying int32 b) {
  4112. /* Only one way to overflow, so test for and prevent it. */
  4113. a += !((b + 1) | ((varying unsigned int32) a + INT32_MIN));
  4114. return a / b;
  4115. }
  4116. static inline uniform int64 saturating_div(uniform int64 a, uniform int64 b) {
  4117. /* Only one way to overflow, so test for and prevent it. */
  4118. a += !((b + 1) | ((uniform unsigned int64) a + INT64_MIN));
  4119. return a / b;
  4120. }
  4121. static inline varying int64 saturating_div(varying int64 a, varying int64 b) {
  4122. /* Only one way to overflow, so test for and prevent it. */
  4123. a += !((b + 1) | ((varying unsigned int64) a + INT64_MIN));
  4124. return a / b;
  4125. }
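// Worked example (not part of the stdlib): the "only one way to overflow"
// mentioned above is INT_MIN / -1, whose mathematical result (+128 for int8)
// is not representable.  The adjustment of 'a' makes saturating_div() return
// the type's maximum for that case instead of trapping or wrapping:
//     saturating_div((int8)-128, (int8)-1) == 127
//     saturating_div((int8)-128, (int8)2)  == -64    // ordinary division otherwise
// Division by zero remains undefined, exactly as for the plain / operator.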
  4126. static inline uniform unsigned int8 saturating_div(uniform unsigned int8 a,
  4127. uniform unsigned int8 b) {
  4128. /* No overflow possible */
  4129. return a / b;
  4130. }
  4131. static inline varying unsigned int8 saturating_div(varying unsigned int8 a,
  4132. varying unsigned int8 b) {
  4133. /* No overflow possible */
  4134. return a / b;
  4135. }
  4136. static inline uniform unsigned int16 saturating_div(uniform unsigned int16 a,
  4137. uniform unsigned int16 b) {
  4138. /* No overflow possible */
  4139. return a / b;
  4140. }
  4141. static inline varying unsigned int16 saturating_div(varying unsigned int16 a,
  4142. varying unsigned int16 b) {
  4143. /* No overflow possible */
  4144. return a / b;
  4145. }
  4146. static inline uniform unsigned int32 saturating_div(uniform unsigned int32 a,
  4147. uniform unsigned int32 b) {
  4148. /* No overflow possible */
  4149. return a / b;
  4150. }
  4151. static inline varying unsigned int32 saturating_div(varying unsigned int32 a,
  4152. varying unsigned int32 b) {
  4153. /* No overflow possible */
  4154. return a / b;
  4155. }
  4156. static inline uniform unsigned int64 saturating_div(uniform unsigned int64 a,
  4157. uniform unsigned int64 b) {
  4158. /* No overflow possible */
  4159. return a / b;
  4160. }
  4161. static inline varying unsigned int64 saturating_div(varying unsigned int64 a,
  4162. varying unsigned int64 b) {
  4163. /* No overflow possible */
  4164. return a / b;
  4165. }
  4166. static inline uniform int8 saturating_mul(uniform int8 a, uniform int8 b) {
  4167. uniform int16 result = (uniform int16) a * (uniform int16) b;
  4168. uniform unsigned int8 result2 = ((uniform unsigned int8) (a ^ b) >> 7) + INT8_MAX;
  4169. uniform int8 hi = result >> 8;
  4170. uniform int8 lo = result;
  4171. if (hi != (lo >> 7))
  4172. result = result2;
  4173. return result;
  4174. }
  4175. static inline varying int8 saturating_mul(varying int8 a, varying int8 b) {
  4176. varying int16 result = (varying int16) a * (varying int16) b;
  4177. varying unsigned int8 result2 = ((varying unsigned int8) (a ^ b) >> 7) + INT8_MAX;
  4178. varying int8 hi = result >> 8;
  4179. varying int8 lo = result;
  4180. if (hi != (lo >> 7))
  4181. result = result2;
  4182. return result;
  4183. }
  4184. static inline uniform int16 saturating_mul(uniform int16 a, uniform int16 b) {
  4185. uniform int32 result = (uniform int32) a * (uniform int32) b;
  4186. uniform unsigned int16 result2 = ((uniform unsigned int16) (a ^ b) >> 15) + INT16_MAX;
  4187. uniform int16 hi = result >> 16;
  4188. uniform int16 lo = result;
  4189. if (hi != (lo >> 15))
  4190. result = result2;
  4191. return result;
  4192. }
  4193. static inline varying int16 saturating_mul(varying int16 a, varying int16 b) {
  4194. varying int32 result = (varying int32) a * (varying int32) b;
  4195. varying unsigned int16 result2 = ((varying unsigned int16) (a ^ b) >> 15) + INT16_MAX;
  4196. varying int16 hi = result >> 16;
  4197. varying int16 lo = result;
  4198. if (hi != (lo >> 15))
  4199. result = result2;
  4200. return result;
  4201. }
  4202. static inline uniform int32 saturating_mul(uniform int32 a, uniform int32 b) {
  4203. uniform int64 result = (uniform int64) a * (uniform int64) b;
  4204. uniform unsigned int32 result2 = ((uniform unsigned int32) (a ^ b) >> 31) + INT32_MAX;
  4205. uniform int32 hi = result >> 32;
  4206. uniform int32 lo = result;
  4207. if (hi != (lo >> 31))
  4208. result = result2;
  4209. return result;
  4210. }
  4211. static inline varying int32 saturating_mul(varying int32 a, varying int32 b) {
  4212. varying int64 result = (varying int64) a * (varying int64) b;
  4213. varying unsigned int32 result2 = ((varying unsigned int32) (a ^ b) >> 31) + INT32_MAX;
  4214. varying int32 hi = result >> 32;
  4215. varying int32 lo = result;
  4216. if (hi != (lo >> 31))
  4217. result = result2;
  4218. return result;
  4219. }
  4220. static inline uniform unsigned int8 saturating_mul(uniform unsigned int8 a,
  4221. uniform unsigned int8 b) {
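/* Widen to 16 bits; any non-zero high byte means the product overflowed.
   In that case -(uniform int8) !! hi is 0xFF, so the OR below saturates
   the result to UINT8_MAX. */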
  4222. uniform unsigned int16 result = (uniform unsigned int16) a *
  4223. (uniform unsigned int16) b;
  4224. uniform unsigned int8 hi = result >> 8;
  4225. uniform unsigned int8 lo = result;
  4226. return lo | - (uniform int8) !! hi;
  4227. }
  4228. static inline varying unsigned int8 saturating_mul(varying unsigned int8 a,
  4229. varying unsigned int8 b) {
  4230. varying unsigned int16 result = (varying unsigned int16) a *
  4231. (varying unsigned int16) b;
  4232. varying unsigned int8 hi = result >> 8;
  4233. varying unsigned int8 lo = result;
  4234. return lo | - (varying int8) !! hi;
  4235. }
  4236. static inline uniform unsigned int16 saturating_mul(uniform unsigned int16 a,
  4237. uniform unsigned int16 b) {
  4238. uniform unsigned int32 result = (uniform unsigned int32) a *
  4239. (uniform unsigned int32) b;
  4240. uniform unsigned int16 hi = result >> 16;
  4241. uniform unsigned int16 lo = result;
  4242. return lo | - (uniform int16) !! hi;
  4243. }
  4244. static inline varying unsigned int16 saturating_mul(varying unsigned int16 a,
  4245. varying unsigned int16 b) {
  4246. varying unsigned int32 result = (varying unsigned int32) a *
  4247. (varying unsigned int32) b;
  4248. varying unsigned int16 hi = result >> 16;
  4249. varying unsigned int16 lo = result;
  4250. return lo | - (varying int16) !! hi;
  4251. }
  4252. static inline uniform unsigned int32 saturating_mul(uniform unsigned int32 a,
  4253. uniform unsigned int32 b) {
  4254. uniform unsigned int64 result = (uniform unsigned int64) a *
  4255. (uniform unsigned int64) b;
  4256. uniform unsigned int32 hi = result >> 32;
  4257. uniform unsigned int32 lo = result;
  4258. return lo | - (uniform int32) !! hi;
  4259. }
  4260. static inline varying unsigned int32 saturating_mul(varying unsigned int32 a,
  4261. varying unsigned int32 b) {
  4262. varying unsigned int64 result = (varying unsigned int64) a *
  4263. (varying unsigned int64) b;
  4264. varying unsigned int32 hi = result >> 32;
  4265. varying unsigned int32 lo = result;
  4266. return lo | - (varying int32) !! hi;
  4267. }
  4268. static inline uniform int64 saturating_mul(uniform int64 a, uniform int64 b) {
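/* There is no wider integer type to widen into, so split |a| and |b| into
   32-bit halves (a1:a0 and b1:b0).  If both high halves are non-zero the
   product cannot fit in 64 bits and we clamp immediately; otherwise only
   one cross term survives, and it is accumulated with the saturating
   helpers above.  The sign is reapplied (and clamped against
   INT64_MIN/INT64_MAX) at the end. */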
  4269. uniform unsigned int64 ret = 0;
  4270. uniform int8 sign = (((a > 0) && (b > 0)) || ((a < 0) && (b < 0))) ? 1 : -1;
  4271. uniform unsigned int64 a_abs = 0;
  4272. uniform unsigned int64 b_abs = 0;
  4273. if (a == INT64_MIN)
4274. // Unary "-" is undefined for INT64_MIN, as it overflows.
4275. // Converting INT64_MIN to an unsigned type, however, yields the correct
4276. // result, i.e. the positive value -INT64_MIN.
4277. // See section 6.3.1.3 of the C99 standard for details (ISPC follows the
4278. // C standard unless the language specifically differs).
  4279. a_abs = (uniform unsigned int64) INT64_MIN;
  4280. else
  4281. a_abs = (a > 0) ? a : -a;
  4282. if (b == INT64_MIN)
  4283. b_abs = (uniform unsigned int64) INT64_MIN;
  4284. else
  4285. b_abs = (b > 0) ? b : -b;
  4286. uniform unsigned int32 a0 = a_abs & 0xFFFFFFFF;
  4287. uniform unsigned int32 b0 = b_abs & 0xFFFFFFFF;
  4288. uniform unsigned int32 a1 = a_abs >> 32;
  4289. uniform unsigned int32 b1 = b_abs >> 32;
  4290. if ((a1 != 0) && (b1 != 0)) {
  4291. if (sign > 0) {
  4292. return INT64_MAX;
  4293. }
  4294. else {
  4295. return INT64_MIN;
  4296. }
  4297. } else if (a1 != 0) {
  4298. ret = saturating_add ((uniform unsigned int64) saturating_mul (b0, a1) << 32 ,
  4299. (uniform unsigned int64) (a0) * b0);
  4300. } else if (b1 != 0) {
  4301. ret = saturating_add ((uniform unsigned int64) saturating_mul (a0, b1) << 32 ,
  4302. (uniform unsigned int64) (a0) * b0);
  4303. } else {
  4304. ret = a_abs * b_abs;
  4305. }
  4306. if ((sign < 0) && (ret >= (uniform unsigned int64) INT64_MIN)) {
  4307. return INT64_MIN;
  4308. } else if ((sign > 0) && (ret >= INT64_MAX)) {
  4309. return INT64_MAX;
  4310. } else {
  4311. return ret * sign;
  4312. }
  4313. }
  4314. static inline varying int64 saturating_mul(varying int64 a, varying int64 b) {
  4315. varying unsigned int64 ret = 0;
  4316. varying int8 sign = (((a > 0) && (b > 0)) || ((a < 0) && (b < 0))) ? 1 : -1;
  4317. varying unsigned int64 a_abs = 0;
  4318. varying unsigned int64 b_abs = 0;
  4319. if (a == INT64_MIN)
4320. // Unary "-" is undefined for INT64_MIN, as it overflows.
4321. // Converting INT64_MIN to an unsigned type, however, yields the correct
4322. // result, i.e. the positive value -INT64_MIN.
4323. // See section 6.3.1.3 of the C99 standard for details (ISPC follows the
4324. // C standard unless the language specifically differs).
  4325. a_abs = (varying unsigned int64) INT64_MIN;
  4326. else
  4327. a_abs = (a > 0) ? a : -a;
  4328. if (b == INT64_MIN)
  4329. b_abs = (varying unsigned int64) INT64_MIN;
  4330. else
  4331. b_abs = (b > 0) ? b : -b;
  4332. varying unsigned int32 a0 = a_abs & 0xFFFFFFFF;
  4333. varying unsigned int32 b0 = b_abs & 0xFFFFFFFF;
  4334. varying unsigned int32 a1 = a_abs >> 32;
  4335. varying unsigned int32 b1 = b_abs >> 32;
  4336. if ((a1 != 0) && (b1 != 0)) {
  4337. if (sign > 0) {
  4338. return INT64_MAX;
  4339. }
  4340. else {
  4341. return INT64_MIN;
  4342. }
  4343. } else if (a1 != 0) {
  4344. ret = saturating_add ((varying unsigned int64) saturating_mul (b0, a1) << 32 ,
  4345. (varying unsigned int64) (a0) * b0);
  4346. } else if (b1 != 0) {
  4347. ret = saturating_add ((varying unsigned int64) saturating_mul (a0, b1) << 32 ,
  4348. (varying unsigned int64) (a0) * b0);
  4349. } else {
  4350. ret = a_abs * b_abs;
  4351. }
  4352. if ((sign < 0) && (ret >= (varying unsigned int64) INT64_MIN)) {
  4353. return INT64_MIN;
  4354. } else if ((sign > 0) && (ret >= INT64_MAX)) {
  4355. return INT64_MAX;
  4356. } else {
  4357. return ret * sign;
  4358. }
  4359. }
  4360. static inline uniform unsigned int64 saturating_mul(uniform unsigned int64 a,
  4361. uniform unsigned int64 b) {
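/* Same 32-bit limb decomposition as the signed 64-bit overload above,
   minus the sign handling. */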
  4362. uniform unsigned int32 a0 = a & 0xFFFFFFFF;
  4363. uniform unsigned int32 b0 = b & 0xFFFFFFFF;
  4364. uniform unsigned int32 a1 = a >> 32;
  4365. uniform unsigned int32 b1 = b >> 32;
  4366. if ((a1 != 0) && (b1 != 0)) {
  4367. return UINT64_MAX;
  4368. } else if (a1 != 0) {
  4369. return saturating_add ((uniform unsigned int64) saturating_mul (b0, a1) << 32 ,
  4370. (uniform unsigned int64) (a0) * b0);
  4371. } else if (b1 != 0) {
  4372. return saturating_add ((uniform unsigned int64) saturating_mul (a0, b1) << 32 ,
  4373. (uniform unsigned int64) (a0) * b0);
  4374. } else {
  4375. return a * b;
  4376. }
  4377. }
  4378. static inline varying unsigned int64 saturating_mul(varying unsigned int64 a,
  4379. varying unsigned int64 b) {
  4380. varying unsigned int32 a0 = a & 0xFFFFFFFF;
  4381. varying unsigned int32 b0 = b & 0xFFFFFFFF;
  4382. varying unsigned int32 a1 = a >> 32;
  4383. varying unsigned int32 b1 = b >> 32;
  4384. if ((a1 != 0) && (b1 != 0)) {
  4385. return UINT64_MAX;
  4386. } else if (a1 != 0) {
  4387. return saturating_add ((varying unsigned int64) saturating_mul (b0, a1) << 32 ,
  4388. (varying unsigned int64) (a0) * b0);
  4389. } else if (b1 != 0) {
  4390. return saturating_add ((varying unsigned int64) saturating_mul (a0, b1) << 32 ,
  4391. (varying unsigned int64) (a0) * b0);
  4392. } else {
  4393. return a * b;
  4394. }
  4395. }
  4396. ///////////////////////////////////////////////////////////////////////////
  4397. // rdrand
  4398. static inline uniform bool rdrand(float * uniform ptr) {
  4399. if (__have_native_rand == false)
  4400. return false;
  4401. else {
  4402. uniform int32 irand;
  4403. uniform bool success = __rdrand_i32(&irand);
  4404. if (success) {
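// Keep 23 random bits for the mantissa, OR in the exponent bits of 1.0f
// (0x3F800000) to build a float in [1, 2), then subtract 1 to map the
// value onto [0, 1).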
  4405. irand &= (1ul<<23)-1;
  4406. *ptr = floatbits(0x3F800000 | irand)-1.0f;
  4407. }
  4408. return success;
  4409. }
  4410. }
  4411. static inline bool rdrand(varying float * uniform ptr) {
  4412. if (__have_native_rand == false)
  4413. return false;
  4414. else {
  4415. bool success = false;
  4416. foreach_active (index) {
  4417. uniform int32 irand;
  4418. if (__rdrand_i32(&irand)) {
4419. // FIXME: it would probably be preferable, here and in the
4420. // following rdrand() function, to do the int->float conversion
4421. // in vector form.  However, we need to be careful not to
4422. // clobber values in *ptr that were already set and belong to
4423. // inactive lanes.
  4424. irand &= (1ul<<23)-1;
  4425. *ptr = floatbits(0x3F800000 | irand)-1.0f;
  4426. success = true;
  4427. }
  4428. }
  4429. return success;
  4430. }
  4431. }
  4432. static inline bool rdrand(float * ptr) {
  4433. if (__have_native_rand == false)
  4434. return false;
  4435. else {
  4436. float * uniform ptrs[programCount];
  4437. ptrs[programIndex] = ptr;
  4438. bool success = false;
  4439. foreach_active (index) {
  4440. uniform int32 irand;
  4441. if (__rdrand_i32(&irand)) {
  4442. irand &= (1ul<<23)-1;
  4443. *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f;
  4444. success = true;
  4445. }
  4446. }
  4447. return success;
  4448. }
  4449. }
  4450. static inline uniform bool rdrand(int16 * uniform ptr) {
  4451. if (__have_native_rand == false)
  4452. return false;
  4453. else
  4454. return __rdrand_i16(ptr);
  4455. }
  4456. static inline bool rdrand(varying int16 * uniform ptr) {
  4457. if (__have_native_rand == false)
  4458. return false;
  4459. else {
  4460. bool success = false;
  4461. foreach_active (index) {
  4462. uniform int16 irand;
  4463. if (__rdrand_i16(&irand)) {
  4464. *ptr = irand;
  4465. success = true;
  4466. }
  4467. }
  4468. return success;
  4469. }
  4470. }
  4471. static inline bool rdrand(int16 * ptr) {
  4472. if (__have_native_rand == false)
  4473. return false;
  4474. else {
  4475. int16 * uniform ptrs[programCount];
  4476. ptrs[programIndex] = ptr;
  4477. bool success = false;
  4478. foreach_active (index) {
  4479. uniform int16 irand;
  4480. if (__rdrand_i16(&irand)) {
  4481. *ptrs[index] = irand;
  4482. success = true;
  4483. }
  4484. }
  4485. return success;
  4486. }
  4487. }
  4488. static inline uniform bool rdrand(int32 * uniform ptr) {
  4489. if (__have_native_rand == false)
  4490. return false;
  4491. else
  4492. return __rdrand_i32(ptr);
  4493. }
  4494. static inline bool rdrand(varying int32 * uniform ptr) {
  4495. if (__have_native_rand == false)
  4496. return false;
  4497. else {
  4498. bool success = false;
  4499. foreach_active (index) {
  4500. uniform int32 irand;
  4501. if (__rdrand_i32(&irand)) {
  4502. *ptr = irand;
  4503. success = true;
  4504. }
  4505. }
  4506. return success;
  4507. }
  4508. }
  4509. static inline bool rdrand(int32 * ptr) {
  4510. if (__have_native_rand == false)
  4511. return false;
  4512. else {
  4513. int32 * uniform ptrs[programCount];
  4514. ptrs[programIndex] = ptr;
  4515. bool success = false;
  4516. foreach_active (index) {
  4517. uniform int32 irand;
  4518. if (__rdrand_i32(&irand)) {
  4519. *ptrs[index] = irand;
  4520. success = true;
  4521. }
  4522. }
  4523. return success;
  4524. }
  4525. }
  4526. static inline uniform bool rdrand(int64 * uniform ptr) {
  4527. if (__have_native_rand == false)
  4528. return false;
  4529. else
  4530. return __rdrand_i64(ptr);
  4531. }
  4532. static inline bool rdrand(varying int64 * uniform ptr) {
  4533. if (__have_native_rand == false)
  4534. return false;
  4535. else {
  4536. bool success = false;
  4537. foreach_active (index) {
  4538. uniform int64 irand;
  4539. if (__rdrand_i64(&irand)) {
  4540. *ptr = irand;
  4541. success = true;
  4542. }
  4543. }
  4544. return success;
  4545. }
  4546. }
  4547. static inline bool rdrand(int64 * ptr) {
  4548. if (__have_native_rand == false)
  4549. return false;
  4550. else {
  4551. int64 * uniform ptrs[programCount];
  4552. ptrs[programIndex] = ptr;
  4553. bool success = false;
  4554. foreach_active (index) {
  4555. uniform int64 irand;
  4556. if (__rdrand_i64(&irand)) {
  4557. *ptrs[index] = irand;
  4558. success = true;
  4559. }
  4560. }
  4561. return success;
  4562. }
  4563. }
  4564. ///////////////////////////////////////////////////////////////////////////
  4565. // Fast vector integer division
  4566. /* These tables and the algorithms in the __fast_idiv() functions below are
  4567. from Halide; the idea is based on the paper "Division by Invariant
  4568. Integers using Multiplication" by Granlund and Montgomery.
  4569. Copyright (c) 2012 MIT CSAIL
  4570. Developed by:
  4571. The Halide team
  4572. MIT CSAIL
  4573. http://halide-lang.org
  4574. Permission is hereby granted, free of charge, to any person obtaining a
  4575. copy of this software and associated documentation files (the
  4576. "Software"), to deal in the Software without restriction, including
  4577. without limitation the rights to use, copy, modify, merge, publish,
  4578. distribute, sublicense, and/or sell copies of the Software, and to
  4579. permit persons to whom the Software is furnished to do so, subject to
  4580. the following conditions:
  4581. The above copyright notice and this permission notice shall be included
  4582. in all copies or substantial portions of the Software.
  4583. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  4584. OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  4585. MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  4586. NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  4587. LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  4588. OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  4589. WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  4590. */
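/* Table layout, as consumed by the __fast_idiv() functions below: each row
   is indexed by divisor-2 and holds {method, multiplier, shift}.
     method 0: the divisor is a power of two; just shift right by 'shift'.
     method 1: q = (n * multiplier) >> (width + shift), where width is the
       bit width of the type.  E.g. for uint8 and divisor 3 the row is
       {1, 171, 1}: 200/3 = (200 * 171) >> 9 = 66.
     method 2: the exact multiplier would not fit in 'width' bits, so take
       the high half of n * multiplier and average the remainder back in
       before shifting.  E.g. for uint8 and divisor 7 the row is {2, 37, 2}:
       t = (100 * 37) >> 8 = 14; t += (100 - 14) >> 1 -> 57; 57 >> 2 = 14. */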
  4591. static const uniform int64 __idiv_table_u8[][3] = {
  4592. {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2},
  4593. {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2},
  4594. {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3},
  4595. {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2},
  4596. {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4},
  4597. {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1},
  4598. {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4},
  4599. {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2},
  4600. {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4},
  4601. {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4},
  4602. {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5},
  4603. {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4},
  4604. {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5},
  4605. {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5},
  4606. {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5},
  4607. {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5},
  4608. {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4},
  4609. {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5},
  4610. {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5},
  4611. {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5},
  4612. {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6},
  4613. {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6},
  4614. {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6},
  4615. {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6},
  4616. {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3},
  4617. {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2},
  4618. {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3},
  4619. {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6},
  4620. {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6},
  4621. {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6},
  4622. {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6},
  4623. {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6},
  4624. {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4},
  4625. {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6},
  4626. {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6},
  4627. {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6},
  4628. {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6},
  4629. {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6},
  4630. {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6},
  4631. {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6},
  4632. {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6},
  4633. {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6},
  4634. {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6},
  4635. {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4},
  4636. {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6},
  4637. {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5},
  4638. {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4},
  4639. {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6},
  4640. {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6},
  4641. {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6},
  4642. {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6},
  4643. {1, 53LL, 5}, {1, 211LL, 7}, {1, 105LL, 6},
  4644. {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6},
  4645. {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6},
  4646. {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6},
  4647. {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6},
  4648. {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7},
  4649. {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5},
  4650. {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7},
  4651. {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6},
  4652. {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7},
  4653. {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3},
  4654. {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7},
  4655. {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6},
  4656. {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4},
  4657. {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7},
  4658. {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7},
  4659. {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2},
  4660. {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6},
  4661. {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5},
  4662. {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7},
  4663. {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7},
  4664. {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7},
  4665. {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7},
  4666. {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7},
  4667. {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7},
  4668. {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7},
  4669. {1, 141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5},
  4670. {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6},
  4671. {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4},
  4672. {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7},
  4673. {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7},
  4674. {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7},
  4675. {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6},
  4676. {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8},
  4677. };
  4678. static const uniform int64 __idiv_table_s8[][3] = {
  4679. {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2},
  4680. {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2},
  4681. {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2},
  4682. {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2},
  4683. {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4},
  4684. {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1},
  4685. {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4},
  4686. {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2},
  4687. {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4},
  4688. {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4},
  4689. {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4},
  4690. {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4},
  4691. {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4},
  4692. {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0},
  4693. {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3},
  4694. {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2},
  4695. {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4},
  4696. {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4},
  4697. {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4},
  4698. {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5},
  4699. {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6},
  4700. {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3},
  4701. {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4},
  4702. {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5},
  4703. {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3},
  4704. {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2},
  4705. {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3},
  4706. {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5},
  4707. {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4},
  4708. {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5},
  4709. {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2},
  4710. {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5},
  4711. {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4},
  4712. {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1},
  4713. {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4},
  4714. {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6},
  4715. {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6},
  4716. {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6},
  4717. {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4},
  4718. {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3},
  4719. {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6},
  4720. {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5},
  4721. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4722. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4723. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4724. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4725. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4726. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4727. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4728. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4729. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4730. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4731. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4732. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4733. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4734. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4735. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4736. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4737. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4738. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4739. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4740. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4741. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4742. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4743. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4744. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4745. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4746. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4747. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4748. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4749. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4750. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4751. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4752. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4753. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4754. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4755. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4756. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4757. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4758. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4759. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4760. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4761. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4762. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4763. {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
  4764. };
  4765. static const uniform int64 __idiv_table_u16[][3] = {
  4766. {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2},
  4767. {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2},
  4768. {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3},
  4769. {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2},
  4770. {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4},
  4771. {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4},
  4772. {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4},
  4773. {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4},
  4774. {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4},
  4775. {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4},
  4776. {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5},
  4777. {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2},
  4778. {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5},
  4779. {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5},
  4780. {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5},
  4781. {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5},
  4782. {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4},
  4783. {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5},
  4784. {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4},
  4785. {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3},
  4786. {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6},
  4787. {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5},
  4788. {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6},
  4789. {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6},
  4790. {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6},
  4791. {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6},
  4792. {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6},
  4793. {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6},
  4794. {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6},
  4795. {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6},
  4796. {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6},
  4797. {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6},
  4798. {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6},
  4799. {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5},
  4800. {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6},
  4801. {2, 12863LL, 6}, {2, 12137LL, 6}, {1, 2405LL, 2},
  4802. {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6},
  4803. {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6},
  4804. {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6},
  4805. {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3},
  4806. {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6},
  4807. {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6},
  4808. {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3},
  4809. {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7},
  4810. {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7},
  4811. {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6},
  4812. {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7},
  4813. {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7},
  4814. {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4},
  4815. {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6},
  4816. {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4},
  4817. {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7},
  4818. {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7},
  4819. {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4},
  4820. {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6},
  4821. {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7},
  4822. {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7},
  4823. {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 7},
  4824. {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7},
  4825. {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6},
  4826. {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7},
  4827. {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7},
  4828. {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7},
  4829. {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7},
  4830. {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7},
  4831. {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6},
  4832. {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4},
  4833. {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7},
  4834. {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6},
  4835. {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7},
  4836. {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7},
  4837. {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7},
  4838. {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7},
  4839. {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7},
  4840. {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6},
  4841. {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4},
  4842. {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6},
  4843. {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7},
  4844. {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7},
  4845. {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4},
  4846. {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5},
  4847. {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6},
  4848. {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7},
  4849. {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7},
  4850. {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8},
  4851. };
  4852. static const uniform int64 __idiv_table_s16[][3] = {
  4853. {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2},
  4854. {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1},
  4855. {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2},
  4856. {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2},
  4857. {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4},
  4858. {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4},
  4859. {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0},
  4860. {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1},
  4861. {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3},
  4862. {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3},
  4863. {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4},
  4864. {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2},
  4865. {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4},
  4866. {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3},
  4867. {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5},
  4868. {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1},
  4869. {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4},
  4870. {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5},
  4871. {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4},
  4872. {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3},
  4873. {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6},
  4874. {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5},
  4875. {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2},
  4876. {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6},
  4877. {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6},
  4878. {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5},
  4879. {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6},
  4880. {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5},
  4881. {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2},
  4882. {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4},
  4883. {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6},
  4884. {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6},
  4885. {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3},
  4886. {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5},
  4887. {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5},
  4888. {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2},
  4889. {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5},
  4890. {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6},
  4891. {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5},
  4892. {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3},
  4893. {1, 8595LL, 4}, {1, 34101LL, 6}, {1, 16913LL, 5},
  4894. {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6},
  4895. {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3},
  4896. {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6},
  4897. {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6},
  4898. {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6},
  4899. {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5},
  4900. {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6},
  4901. {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4},
  4902. {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6},
  4903. {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4},
  4904. {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4},
  4905. {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6},
  4906. {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4},
  4907. {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6},
  4908. {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6},
  4909. {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5},
  4910. {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1},
  4911. {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4},
  4912. {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6},
  4913. {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7},
  4914. {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5},
  4915. {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7},
  4916. {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7},
  4917. {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3},
  4918. {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6},
  4919. {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4},
  4920. {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7},
  4921. {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6},
  4922. {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7},
  4923. {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2},
  4924. {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6},
  4925. {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7},
  4926. {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6},
  4927. {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6},
  4928. {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4},
  4929. {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6},
  4930. {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7},
  4931. {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7},
  4932. {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4},
  4933. {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5},
  4934. {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6},
  4935. {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7},
  4936. {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7},
  4937. {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7},
  4938. };
  4939. static const uniform int64 __idiv_table_u32[][3] = {
  4940. {0, 0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2},
  4941. {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2},
  4942. {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3},
  4943. {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2},
  4944. {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4},
  4945. {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4},
  4946. {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4},
  4947. {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3},
  4948. {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4},
  4949. {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4},
  4950. {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5},
  4951. {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5},
  4952. {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5},
  4953. {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3},
  4954. {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5},
  4955. {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4},
  4956. {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4},
  4957. {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5},
  4958. {2, 613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5},
  4959. {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4},
  4960. {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6},
  4961. {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1},
  4962. {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6},
  4963. {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6},
  4964. {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6},
  4965. {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6},
  4966. {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6},
  4967. {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6},
  4968. {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6},
  4969. {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6},
  4970. {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6},
  4971. {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6},
  4972. {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5},
  4973. {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6},
  4974. {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6},
  4975. {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6},
  4976. {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6},
  4977. {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6},
  4978. {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4},
  4979. {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6},
  4980. {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6},
  4981. {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6},
  4982. {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7},
  4983. {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7},
  4984. {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7},
  4985. {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6},
  4986. {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7},
  4987. {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7},
  4988. {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7},
  4989. {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5},
  4990. {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5},
  4991. {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5},
  4992. {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7},
  4993. {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7},
  4994. {1, 3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5},
  4995. {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6},
  4996. {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5},
  4997. {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6},
  4998. {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7},
  4999. {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7},
  5000. {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7},
  5001. {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7},
  5002. {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7},
  5003. {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4},
  5004. {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6},
  5005. {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6},
  5006. {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7},
  5007. {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6},
  5008. {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6},
  5009. {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7},
  5010. {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7},
  5011. {1, 1278501893LL, 6}, {2, 795364315LL, 7}, {2, 771906565LL, 7},
  5012. {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7},
  5013. {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7},
  5014. {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7},
  5015. {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6},
  5016. {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7},
  5017. {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7},
  5018. {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6},
  5019. {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4},
  5020. {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6},
  5021. {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7},
  5022. {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4},
  5023. {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7},
  5024. {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8},
  5025. };
  5026. static const uniform int64 __idiv_table_s32[][3] = {
  5027. {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2},
  5028. {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2},
  5029. {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2},
  5030. {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2},
  5031. {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4},
  5032. {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3},
  5033. {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2},
  5034. {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3},
  5035. {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4},
  5036. {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4},
  5037. {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4},
  5038. {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5},
  5039. {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4},
  5040. {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3},
  5041. {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5},
  5042. {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4},
  5043. {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4},
  5044. {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1},
  5045. {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5},
  5046. {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4},
  5047. {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6},
  5048. {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1},
  5049. {1, 2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6},
  5050. {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6},
  5051. {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5},
  5052. {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5},
  5053. {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5},
  5054. {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5},
  5055. {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4},
  5056. {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6},
  5057. {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6},
  5058. {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3},
  5059. {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5},
  5060. {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3},
  5061. {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5},
  5062. {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6},
  5063. {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6},
  5064. {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5},
  5065. {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4},
  5066. {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {1, 70991195LL, 1},
  5067. {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6},
  5068. {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6},
  5069. {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6},
  5070. {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7},
  5071. {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6},
  5072. {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6},
  5073. {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7},
  5074. {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7},
  5075. {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7},
  5076. {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5},
  5077. {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5},
  5078. {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5},
  5079. {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6},
  5080. {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3},
  5081. {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5},
  5082. {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6},
  5083. {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5},
  5084. {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6},
  5085. {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7},
  5086. {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5},
  5087. {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7},
  5088. {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7},
  5089. {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7},
  5090. {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4},
  5091. {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6},
  5092. {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6},
  5093. {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5},
  5094. {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6},
  5095. {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6},
  5096. {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2},
  5097. {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6},
  5098. {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7},
  5099. {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3},
  5100. {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7},
  5101. {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6},
  5102. {1, 605457945LL, 5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6},
  5103. {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7},
  5104. {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7},
  5105. {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6},
  5106. {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4},
  5107. {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6},
  5108. {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7},
  5109. {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4},
  5110. {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7},
  5111. {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7},
  5112. };
  5113. __declspec(safe)
  5114. static unmasked inline unsigned int8
  5115. __fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) {
  5116. uniform int64 method = __idiv_table_u8[divisor-2][0];
  5117. uniform int64 multiplier = __idiv_table_u8[divisor-2][1];
  5118. uniform int64 shift = __idiv_table_u8[divisor-2][2];
  5119. unsigned int16 mult = multiplier;
  5120. unsigned int16 val = numerator;
  5121. if (method == 0)
  5122. return numerator >> shift;
  5123. else if (method == 1)
  5124. return (val * mult) >> (8 + shift);
  5125. else {
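// Method 2: take the high half of the product, then average in
// (numerator - val) to recover the lost bit of precision before the
// final shift (see the table comment above).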
  5126. val *= mult;
  5127. val >>= 8;
  5128. val += (numerator-val)>>1;
  5129. return (val >> shift);
  5130. }
  5131. }
  5132. __declspec(safe)
  5133. static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) {
  5134. uniform int8 method = __idiv_table_s8[divisor-2][0];
  5135. uniform int16 multiplier = __idiv_table_s8[divisor-2][1];
  5136. uniform int8 shift = __idiv_table_s8[divisor-2][2];
  5137. if (method == 0)
  5138. return numerator >> shift;
  5139. else {
  5140. unsigned int8 sign = numerator >> 7;
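// sign is 0 for non-negative numerators and all ones for negative ones;
// this XOR and the XOR on the result implement a branchless conditional
// (one's-complement) negation, with the signed table constants chosen so
// the final result matches C's truncation toward zero.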
  5141. numerator ^= sign;
  5142. int16 mul = (int16)numerator * (int16)multiplier;
  5143. mul >>= 8 + shift;
  5144. return (int8)mul ^ sign;
  5145. }
  5146. }
  5147. __declspec(safe)
  5148. static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator,
  5149. uniform unsigned int16 divisor) {
  5150. uniform int64 method = __idiv_table_u16[divisor-2][0];
  5151. uniform int64 multiplier = __idiv_table_u16[divisor-2][1];
  5152. uniform int64 shift = __idiv_table_u16[divisor-2][2];
  5153. unsigned int32 mult = multiplier;
  5154. unsigned int32 val = numerator;
  5155. if (method == 0)
  5156. return numerator >> shift;
  5157. else if (method == 1)
  5158. return (val * mult) >> (16 + shift);
  5159. else {
  5160. val *= mult;
  5161. val >>= 16;
  5162. val += (numerator-val)>>1;
  5163. return val >> shift;
  5164. }
  5165. }
  5166. __declspec(safe)
  5167. static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) {
  5168. uniform int64 method = __idiv_table_s16[divisor-2][0];
  5169. uniform int64 multiplier = __idiv_table_s16[divisor-2][1];
  5170. uniform int64 shift = __idiv_table_s16[divisor-2][2];
  5171. if (method == 0)
  5172. return numerator >> shift;
  5173. else {
  5174. unsigned int16 sign = numerator >> 15;
  5175. numerator ^= sign;
  5176. int32 mul = (int32)numerator * (int32)multiplier;
  5177. mul >>= 16 + shift;
  5178. int16 result = mul;
  5179. return result ^ sign;
  5180. }
  5181. }
  5182. __declspec(safe)
5183. static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator,
  5184. uniform unsigned int32 divisor) {
  5185. uniform int64 method = __idiv_table_u32[divisor-2][0];
  5186. uniform int64 multiplier = __idiv_table_u32[divisor-2][1];
  5187. uniform int64 shift = __idiv_table_u32[divisor-2][2];
  5188. unsigned int64 mult = multiplier;
  5189. unsigned int64 val = numerator;
  5190. if (method == 0)
  5191. return numerator >> shift;
  5192. else if (method == 1)
  5193. return (val * mult) >> (32 + shift);
  5194. else {
  5195. val *= mult;
  5196. val >>= 32;
  5197. val += (numerator-val)>>1;
  5198. return val >> shift;
  5199. }
  5200. }
  5201. __declspec(safe)
  5202. static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) {
  5203. uniform int64 method = __idiv_table_s32[divisor-2][0];
  5204. uniform int64 multiplier = __idiv_table_s32[divisor-2][1];
  5205. uniform int64 shift = __idiv_table_s32[divisor-2][2];
  5206. if (method == 0)
  5207. return numerator >> shift;
  5208. else {
  5209. unsigned int32 sign = numerator >> 31;
  5210. numerator ^= sign;
  5211. int64 mul = (int64)numerator * (int64)multiplier;
  5212. mul >>= 32 + shift;
  5213. int32 result = mul;
  5214. return result ^ sign;
  5215. }
  5216. }
  5217. ///////////////////////////////////////////////////////////////////////////
  5218. // Saturating int8/int16 ops
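// avg_up() rounds the average up, i.e. (a + b + 1) / 2, and avg_down()
// rounds it down, i.e. (a + b) / 2; both forward to the __avg_* built-ins,
// which are expected to avoid overflow in the intermediate sum.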
  5219. __declspec(safe)
  5220. static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) {
  5221. return __avg_up_uint8(a, b);
  5222. }
  5223. __declspec(safe)
  5224. static unmasked inline int8 avg_up(int8 a, int8 b) {
  5225. return __avg_up_int8(a, b);
  5226. }
  5227. __declspec(safe)
  5228. static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) {
  5229. return __avg_up_uint16(a, b);
  5230. }
  5231. __declspec(safe)
  5232. static unmasked inline int16 avg_up(int16 a, int16 b) {
  5233. return __avg_up_int16(a, b);
  5234. }
  5235. __declspec(safe)
  5236. static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) {
  5237. return __avg_down_uint8(a, b);
  5238. }
  5239. __declspec(safe)
  5240. static unmasked inline int8 avg_down(int8 a, int8 b) {
  5241. return __avg_down_int8(a, b);
  5242. }
  5243. __declspec(safe)
  5244. static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) {
  5245. return __avg_down_uint16(a, b);
  5246. }
  5247. __declspec(safe)
  5248. static unmasked inline int16 avg_down(int16 a, int16 b) {
  5249. return __avg_down_int16(a, b);
  5250. }