// -*- mode: c++ -*-
/*
  Copyright (c) 2010-2014, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file stdlib.ispc
    @brief Portion of the ispc standard library implementation that's in
    ispc code
*/
#if (ISPC_MASK_BITS == 1)
#define IntMaskType bool
#define UIntMaskType bool
#elif (ISPC_MASK_BITS == 8)
#define IntMaskType int8
#define UIntMaskType unsigned int8
#elif (ISPC_MASK_BITS == 16)
#define IntMaskType int16
#define UIntMaskType unsigned int16
#elif (ISPC_MASK_BITS == 32)
#define IntMaskType int32
#define UIntMaskType unsigned int32
#elif (ISPC_MASK_BITS == 64)
#define IntMaskType int64
#define UIntMaskType unsigned int64
#else
#error Unknown value of ISPC_MASK_BITS
#endif

///////////////////////////////////////////////////////////////////////////
// CUDA-specific primitives
//

/***************/
__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); }

/***************/
__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }

/***************/
__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
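
// Illustrative note (an assumption about typical usage; not part of the
// library): these wrappers back the taskIndex/taskCount values that task
// functions see. A launched task might, for example, pick the image row it
// is responsible for:
//
//     task void row_task(uniform float img[], uniform int width) {
//         uniform int row = taskIndex;      // one task per row
//         foreach (x = 0 ... width)
//             img[row * width + x] = 0.;
//     }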

/* Limits of integral types. */
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef INT64_MAX
#define INT64_MAX (9223372036854775807)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295)
#endif
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615)
#endif
#ifndef INT8_MIN
#define INT8_MIN (-INT8_MAX - 1)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-INT16_MAX - 1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-INT32_MAX - 1)
#endif
#ifndef INT64_MIN
#define INT64_MIN (-INT64_MAX - 1)
#endif

///////////////////////////////////////////////////////////////////////////
// Low level primitives

__declspec(safe,cost0)
static inline float floatbits(unsigned int a) {
    return __floatbits_varying_int32(a);
}

__declspec(safe,cost0)
static inline uniform float floatbits(uniform unsigned int a) {
    return __floatbits_uniform_int32(a);
}

__declspec(safe,cost0)
static inline float floatbits(int a) {
    return __floatbits_varying_int32(a);
}

__declspec(safe,cost0)
static inline uniform float floatbits(uniform int a) {
    return __floatbits_uniform_int32(a);
}

__declspec(safe,cost0)
static inline double doublebits(unsigned int64 a) {
    return __doublebits_varying_int64(a);
}

__declspec(safe,cost0)
static inline uniform double doublebits(uniform unsigned int64 a) {
    return __doublebits_uniform_int64(a);
}

__declspec(safe,cost0)
static inline unsigned int intbits(float a) {
    return __intbits_varying_float(a);
}

__declspec(safe,cost0)
static inline uniform unsigned int intbits(uniform float a) {
    return __intbits_uniform_float(a);
}

__declspec(safe,cost0)
static inline unsigned int64 intbits(double d) {
    return __intbits_varying_double(d);
}

__declspec(safe,cost0)
static inline uniform unsigned int64 intbits(uniform double d) {
    return __intbits_uniform_double(d);
}
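
// Illustrative sketch (not part of the library): intbits()/floatbits() give
// lossless access to the IEEE-754 bit pattern of a value, e.g. to flip the
// sign bit directly.
static inline float example_negate_via_bits(float a) {
    unsigned int bits = intbits(a);
    bits ^= 0x80000000;          // toggle the IEEE-754 sign bit
    return floatbits(bits);
}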

__declspec(safe)
static inline float broadcast(float v, uniform int i) {
    return __broadcast_float(v, i);
}

__declspec(safe)
static inline int8 broadcast(int8 v, uniform int i) {
    return __broadcast_i8(v, i);
}

__declspec(safe)
static inline int16 broadcast(int16 v, uniform int i) {
    return __broadcast_i16(v, i);
}

__declspec(safe)
static inline int32 broadcast(int32 v, uniform int i) {
    return __broadcast_i32(v, i);
}

__declspec(safe)
static inline double broadcast(double v, uniform int i) {
    return __broadcast_double(v, i);
}

__declspec(safe)
static inline int64 broadcast(int64 v, uniform int i) {
    return __broadcast_i64(v, i);
}

__declspec(safe)
static inline float rotate(float v, uniform int i) {
    return __rotate_float(v, i);
}

__declspec(safe)
static inline int8 rotate(int8 v, uniform int i) {
    return __rotate_i8(v, i);
}

__declspec(safe)
static inline int16 rotate(int16 v, uniform int i) {
    return __rotate_i16(v, i);
}

__declspec(safe)
static inline int32 rotate(int32 v, uniform int i) {
    return __rotate_i32(v, i);
}

__declspec(safe)
static inline double rotate(double v, uniform int i) {
    return __rotate_double(v, i);
}

__declspec(safe)
static inline int64 rotate(int64 v, uniform int i) {
    return __rotate_i64(v, i);
}
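
// Illustrative sketch (not part of the library): broadcast() copies the
// value from one lane to all lanes, and rotate() moves values between
// lanes with wrap-around, so each lane can read a neighbor's value.
static inline float example_neighbor_sum(float v) {
    // Each lane adds the value held by the next lane; the last lane
    // wraps around to lane 0.
    return v + rotate(v, 1);
}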

__declspec(safe)
static inline float shift(float v, uniform int i) {
    varying float result;
    unmasked {
        result = __shift_float(v, i);
    }
    return result;
}

__declspec(safe)
static inline int8 shift(int8 v, uniform int i) {
    varying int8 result;
    unmasked {
        result = __shift_i8(v, i);
    }
    return result;
}

__declspec(safe)
static inline int16 shift(int16 v, uniform int i) {
    varying int16 result;
    unmasked {
        result = __shift_i16(v, i);
    }
    return result;
}

__declspec(safe)
static inline int32 shift(int32 v, uniform int i) {
    varying int32 result;
    unmasked {
        result = __shift_i32(v, i);
    }
    return result;
}

__declspec(safe)
static inline double shift(double v, uniform int i) {
    varying double result;
    unmasked {
        result = __shift_double(v, i);
    }
    return result;
}

__declspec(safe)
static inline int64 shift(int64 v, uniform int i) {
    varying int64 result;
    unmasked {
        result = __shift_i64(v, i);
    }
    return result;
}
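
// Illustrative note (the shift direction is an assumption based on the
// rotate() analogy; not part of the library): shift() is the non-wrapping
// counterpart of rotate(), and lanes whose source position falls outside
// the gang receive an unspecified value. The unmasked blocks above ensure
// the whole-gang data movement happens regardless of which lanes are
// currently active.
static inline float example_shift_by_one(float v) {
    return shift(v, 1);   // lane i reads the value lane i+1 held
}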

__declspec(safe)
static inline float shuffle(float v, int i) {
    return __shuffle_float(v, i);
}

__declspec(safe)
static inline int8 shuffle(int8 v, int i) {
    return __shuffle_i8(v, i);
}

__declspec(safe)
static inline int16 shuffle(int16 v, int i) {
    return __shuffle_i16(v, i);
}

__declspec(safe)
static inline int32 shuffle(int32 v, int i) {
    return __shuffle_i32(v, i);
}

__declspec(safe)
static inline double shuffle(double v, int i) {
    return __shuffle_double(v, i);
}

__declspec(safe)
static inline int64 shuffle(int64 v, int i) {
    return __shuffle_i64(v, i);
}

__declspec(safe)
static inline float shuffle(float v0, float v1, int i) {
    return __shuffle2_float(v0, v1, i);
}

__declspec(safe)
static inline int8 shuffle(int8 v0, int8 v1, int i) {
    return __shuffle2_i8(v0, v1, i);
}

__declspec(safe)
static inline int16 shuffle(int16 v0, int16 v1, int i) {
    return __shuffle2_i16(v0, v1, i);
}

__declspec(safe)
static inline int32 shuffle(int32 v0, int32 v1, int i) {
    return __shuffle2_i32(v0, v1, i);
}

__declspec(safe)
static inline double shuffle(double v0, double v1, int i) {
    return __shuffle2_double(v0, v1, i);
}

__declspec(safe)
static inline int64 shuffle(int64 v0, int64 v1, int i) {
    return __shuffle2_i64(v0, v1, i);
}
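
// Illustrative sketch (not part of the library): the one-vector shuffle()
// permutes lanes by a varying index; the two-vector form indexes the
// concatenation of v0 and v1, with indices of programCount and above
// selecting from v1.
static inline float example_reverse_lanes(float v) {
    return shuffle(v, programCount - 1 - programIndex);
}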

// x[i]
__declspec(safe,cost1)
static inline uniform float extract(float x, uniform int i) {
    return floatbits(__extract_int32((int)intbits(x), i));
}

__declspec(safe,cost1)
static inline uniform int8 extract(int8 x, uniform int i) {
    return __extract_int8(x, i);
}

__declspec(safe,cost1)
static inline uniform unsigned int8 extract(unsigned int8 x, uniform int i) {
    return __extract_int8(x, (unsigned int)i);
}

__declspec(safe,cost1)
static inline uniform int16 extract(int16 x, uniform int i) {
    return __extract_int16(x, i);
}

__declspec(safe,cost1)
static inline uniform unsigned int16 extract(unsigned int16 x, uniform int i) {
    return __extract_int16(x, (unsigned int)i);
}

__declspec(safe,cost1)
static inline uniform int32 extract(int32 x, uniform int i) {
    return __extract_int32(x, i);
}

__declspec(safe,cost1)
static inline uniform unsigned int32 extract(unsigned int32 x, uniform int i) {
    return __extract_int32(x, (unsigned int)i);
}

__declspec(safe,cost1)
static inline uniform double extract(double x, uniform int i) {
    return doublebits(__extract_int64((int64)intbits(x), i));
}

__declspec(safe,cost1)
static inline uniform int64 extract(int64 x, uniform int i) {
    return __extract_int64(x, i);
}

__declspec(safe,cost1)
static inline uniform unsigned int64 extract(unsigned int64 x, uniform int i) {
    return __extract_int64(x, (unsigned int)i);
}

// x[i] = v
__declspec(safe,cost1)
static inline float insert(float x, uniform int i, uniform float v) {
    return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
}

__declspec(safe,cost1)
static inline int8 insert(int8 x, uniform int i, uniform int8 v) {
    return __insert_int8(x, i, v);
}

__declspec(safe,cost1)
static inline unsigned int8 insert(unsigned int8 x, uniform int i,
                                   uniform unsigned int8 v) {
    return __insert_int8(x, (unsigned int)i, v);
}

__declspec(safe,cost1)
static inline int16 insert(int16 x, uniform int i, uniform int16 v) {
    return __insert_int16(x, i, v);
}

__declspec(safe,cost1)
static inline unsigned int16 insert(unsigned int16 x, uniform int i,
                                    uniform unsigned int16 v) {
    return __insert_int16(x, (unsigned int)i, v);
}

__declspec(safe,cost1)
static inline int32 insert(int32 x, uniform int i, uniform int32 v) {
    return __insert_int32(x, i, v);
}

__declspec(safe,cost1)
static inline unsigned int32 insert(unsigned int32 x, uniform int i,
                                    uniform unsigned int32 v) {
    return __insert_int32(x, (unsigned int)i, v);
}

__declspec(safe,cost1)
static inline double insert(double x, uniform int i, uniform double v) {
    return doublebits(__insert_int64((int64)intbits(x), i, (int64)intbits(v)));
}

__declspec(safe,cost1)
static inline int64 insert(int64 x, uniform int i, uniform int64 v) {
    return __insert_int64(x, i, v);
}

__declspec(safe,cost1)
static inline unsigned int64 insert(unsigned int64 x, uniform int i,
                                    uniform unsigned int64 v) {
    return __insert_int64(x, (unsigned int)i, v);
}
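
// Illustrative sketch (not part of the library): extract() and insert()
// move individual lanes between varying and uniform values; the
// lane-serialized fallbacks throughout this file are built on exactly this
// pattern.
static inline int example_lanewise_double(int v) {
    int r = 0;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, 2 * extract(v, i));   // process one lane at a time
    return r;
}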

__declspec(safe,cost1)
static inline uniform int32 sign_extend(uniform bool v) {
    return __sext_uniform_bool(v);
}

__declspec(safe,cost1)
static inline int32 sign_extend(bool v) {
    return __sext_varying_bool(v);
}

__declspec(safe)
static inline uniform bool any(bool v) {
    // We only care about whether "any" is true for the active program instances,
    // so we have to mask v with the current program mask.
#if (ISPC_MASK_BITS == 1)
    return __any(v & __mask);
#else
    return __any((UIntMaskType)__sext_varying_bool(v) & __mask);
#endif
}

__declspec(safe)
static inline uniform bool all(bool v) {
    // As with any(), we need to explicitly mask v with the current program mask
    // so we're only looking at the current lanes
#if (ISPC_MASK_BITS == 1)
    return __all(v | !__mask);
#else
    return __all((UIntMaskType)__sext_varying_bool(v) | !__mask);
#endif
}

__declspec(safe)
static inline uniform bool none(bool v) {
    // As with any(), we need to explicitly mask v with the current program mask
    // so we're only looking at the current lanes
#if (ISPC_MASK_BITS == 1)
    return __none(v & __mask);
#else
    return __none((UIntMaskType)__sext_varying_bool(v) & __mask);
#endif
}
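
// Illustrative sketch (not part of the library): any(), all(), and none()
// reduce a varying bool over the active lanes only, so they can be used
// safely under varying control flow.
static inline uniform bool example_all_positive(float v) {
    // True iff every currently-executing lane holds a positive value.
    return all(v > 0.);
}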

__declspec(safe)
static inline uniform int32 popcnt(uniform int32 v) {
    return __popcnt_int32(v);
}

__declspec(safe)
static inline uniform int popcnt(uniform int64 v) {
    return (int32)__popcnt_int64(v);
}

__declspec(safe)
static inline int popcnt(int v) {
    int r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, popcnt(extract(v, i)));
    return __mask ? r : 0;
}

__declspec(safe)
static inline int popcnt(int64 v) {
    int r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, popcnt(extract(v, i)));
    return __mask ? r : 0;
}

__declspec(safe)
static inline uniform int popcnt(bool v) {
    // As with any() and all(), only count across the active lanes
#if (ISPC_MASK_BITS == 1)
    if (__is_nvptx_target)
        return __popcnt_int64(__movmsk_ptx(v & __mask));
    else
        return __popcnt_int64(__movmsk(v & __mask));
#else
    return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
#endif
}

__declspec(safe)
static inline uniform unsigned int64 lanemask() {
    return __movmsk(__mask);
}
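
// Illustrative sketch (not part of the library): popcnt(bool) and
// lanemask() both summarize the execution mask, so counting the lanes for
// which a predicate holds is a one-liner.
static inline uniform int example_count_negative(float v) {
    return popcnt(v < 0.);
}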

///////////////////////////////////////////////////////////////////////////
// memcpy/memmove/memset

static inline void memcpy(void * uniform dst, void * uniform src,
                          uniform int32 count) {
    __memcpy32((int8 * uniform)dst, (int8 * uniform)src, count);
}

static inline void memcpy64(void * uniform dst, void * uniform src,
                            uniform int64 count) {
    __memcpy64((int8 * uniform)dst, (int8 * uniform)src, count);
}

static inline void memcpy(void * varying dst, void * varying src,
                          int32 count) {
    void * uniform da[programCount];
    void * uniform sa[programCount];
    da[programIndex] = dst;
    sa[programIndex] = src;
    foreach_active (i) {
        void * uniform d = da[i], * uniform s = sa[i];
        __memcpy32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
    }
}

static inline void memcpy64(void * varying dst, void * varying src,
                            int64 count) {
    void * uniform da[programCount];
    void * uniform sa[programCount];
    da[programIndex] = dst;
    sa[programIndex] = src;
    foreach_active (i) {
        void * uniform d = da[i], * uniform s = sa[i];
        __memcpy64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
    }
}

static inline void memmove(void * uniform dst, void * uniform src,
                           uniform int32 count) {
    __memmove32((int8 * uniform)dst, (int8 * uniform)src, count);
}

static inline void memmove64(void * uniform dst, void * uniform src,
                             uniform int64 count) {
    __memmove64((int8 * uniform)dst, (int8 * uniform)src, count);
}

static inline void memmove(void * varying dst, void * varying src,
                           int32 count) {
    void * uniform da[programCount];
    void * uniform sa[programCount];
    da[programIndex] = dst;
    sa[programIndex] = src;
    foreach_active (i) {
        void * uniform d = da[i], * uniform s = sa[i];
        __memmove32((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
    }
}

static inline void memmove64(void * varying dst, void * varying src,
                             int64 count) {
    void * uniform da[programCount];
    void * uniform sa[programCount];
    da[programIndex] = dst;
    sa[programIndex] = src;
    foreach_active (i) {
        void * uniform d = da[i], * uniform s = sa[i];
        __memmove64((int8 * uniform)d, (int8 * uniform)s, extract(count, i));
    }
}

static inline void memset(void * uniform ptr, uniform int8 val,
                          uniform int32 count) {
    __memset32((int8 * uniform)ptr, val, count);
}

static inline void memset64(void * uniform ptr, uniform int8 val,
                            uniform int64 count) {
    __memset64((int8 * uniform)ptr, val, count);
}

static inline void memset(void * varying ptr, int8 val, int32 count) {
    void * uniform pa[programCount];
    pa[programIndex] = ptr;
    foreach_active (i) {
        __memset32((int8 * uniform)pa[i], extract(val, i), extract(count, i));
    }
}

static inline void memset64(void * varying ptr, int8 val, int64 count) {
    void * uniform pa[programCount];
    pa[programIndex] = ptr;
    foreach_active (i) {
        __memset64((int8 * uniform)pa[i], extract(val, i), extract(count, i));
    }
}
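
// Illustrative note (an assumption about typical usage; not part of the
// library): the uniform overloads behave like their C namesakes on a
// single block of memory, e.g.
//
//     uniform float src[16], dst[16];
//     memcpy(dst, src, 16 * sizeof(uniform float));
//
// while the varying overloads serialize over the active lanes so that each
// program instance can copy between its own pair of pointers.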

///////////////////////////////////////////////////////////////////////////
// count leading/trailing zeros

__declspec(safe,cost1)
static inline uniform unsigned int32
count_leading_zeros(uniform unsigned int32 v) {
    return __count_leading_zeros_i32(v);
}

__declspec(safe,cost1)
static inline uniform unsigned int64
count_leading_zeros(uniform unsigned int64 v) {
    return __count_leading_zeros_i64(v);
}

__declspec(safe,cost1)
static inline uniform unsigned int32
count_trailing_zeros(uniform unsigned int32 v) {
    return __count_trailing_zeros_i32(v);
}

__declspec(safe,cost1)
static inline uniform unsigned int64
count_trailing_zeros(uniform unsigned int64 v) {
    return __count_trailing_zeros_i64(v);
}

__declspec(safe,cost1)
static inline uniform int32
count_leading_zeros(uniform int32 v) {
    return __count_leading_zeros_i32(v);
}

__declspec(safe,cost1)
static inline uniform int64
count_leading_zeros(uniform int64 v) {
    return __count_leading_zeros_i64(v);
}

__declspec(safe,cost1)
static inline uniform int32
count_trailing_zeros(uniform int32 v) {
    return __count_trailing_zeros_i32(v);
}

__declspec(safe,cost1)
static inline uniform int64
count_trailing_zeros(uniform int64 v) {
    return __count_trailing_zeros_i64(v);
}

__declspec(safe)
static inline unsigned int32
count_leading_zeros(unsigned int32 v) {
    unsigned int32 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_leading_zeros_i32(extract(v, i)));
    return r;
}

__declspec(safe)
static inline unsigned int64
count_leading_zeros(unsigned int64 v) {
    unsigned int64 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_leading_zeros_i64(extract(v, i)));
    return r;
}

__declspec(safe)
static inline unsigned int32
count_trailing_zeros(unsigned int32 v) {
    unsigned int32 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_trailing_zeros_i32(extract(v, i)));
    return r;
}

__declspec(safe)
static inline unsigned int64
count_trailing_zeros(unsigned int64 v) {
    unsigned int64 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_trailing_zeros_i64(extract(v, i)));
    return r;
}

__declspec(safe)
static inline int32
count_leading_zeros(int32 v) {
    int32 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_leading_zeros_i32(extract(v, i)));
    return r;
}

__declspec(safe)
static inline int64
count_leading_zeros(int64 v) {
    int64 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_leading_zeros_i64(extract(v, i)));
    return r;
}

__declspec(safe)
static inline int32
count_trailing_zeros(int32 v) {
    int32 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_trailing_zeros_i32(extract(v, i)));
    return r;
}

__declspec(safe)
static inline int64
count_trailing_zeros(int64 v) {
    int64 r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, __count_trailing_zeros_i64(extract(v, i)));
    return r;
}
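
// Illustrative sketch (not part of the library): these count bits of the
// 32- or 64-bit encoding, e.g. count_leading_zeros(1u) == 31 and
// count_trailing_zeros(8u) == 3 in the 32-bit overloads.
static inline uniform int32 example_floor_log2(uniform unsigned int32 v) {
    // Index of the highest set bit; meaningful only for v != 0.
    return 31 - (uniform int32)count_leading_zeros(v);
}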

///////////////////////////////////////////////////////////////////////////
// AOS/SOA conversion

static inline void
aos_to_soa3(uniform float a[], varying float * uniform v0,
            varying float * uniform v1, varying float * uniform v2) {
    __aos_to_soa3_float(a, v0, v1, v2);
}

static inline void
soa_to_aos3(float v0, float v1, float v2, uniform float a[]) {
    __soa_to_aos3_float(v0, v1, v2, a);
}

static inline void
aos_to_soa4(uniform float a[], varying float * uniform v0,
            varying float * uniform v1, varying float * uniform v2,
            varying float * uniform v3) {
    __aos_to_soa4_float(a, v0, v1, v2, v3);
}

static inline void
soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[]) {
    __soa_to_aos4_float(v0, v1, v2, v3, a);
}

static inline void
aos_to_soa3(uniform int32 a[], varying int32 * uniform v0,
            varying int32 * uniform v1, varying int32 * uniform v2) {
    aos_to_soa3((uniform float * uniform)a, (varying float * uniform)v0,
                (varying float * uniform)v1, (varying float * uniform)v2);
}

static inline void
soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[]) {
    soa_to_aos3(floatbits(v0), floatbits(v1), floatbits(v2),
                (uniform float * uniform)a);
}

static inline void
aos_to_soa4(uniform int32 a[], varying int32 * uniform v0,
            varying int32 * uniform v1, varying int32 * uniform v2,
            varying int32 * uniform v3) {
    aos_to_soa4((uniform float * uniform)a, (varying float * uniform)v0,
                (varying float * uniform)v1, (varying float * uniform)v2,
                (varying float * uniform)v3);
}

static inline void
soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[]) {
    soa_to_aos4(floatbits(v0), floatbits(v1), floatbits(v2), floatbits(v3),
                (uniform float * uniform)a);
}
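
// Illustrative sketch (not part of the library): aos_to_soa3() turns an
// array of programCount (x, y, z) triples into three varying values, one
// per field, so the gang can work on the components directly.
static inline float example_squared_length(uniform float xyz[]) {
    // xyz[] is assumed to hold programCount triples laid out x0,y0,z0,x1,...
    varying float x, y, z;
    aos_to_soa3(xyz, &x, &y, &z);
    return x * x + y * y + z * z;   // per-instance squared length
}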

///////////////////////////////////////////////////////////////////////////
// Prefetching

__declspec(safe,cost1)
static inline void prefetch_l1(const void * uniform ptr) {
    __prefetch_read_uniform_1((uniform int8 * uniform)ptr);
}

__declspec(safe,cost1)
static inline void prefetch_l2(const void * uniform ptr) {
    __prefetch_read_uniform_2((uniform int8 * uniform)ptr);
}

__declspec(safe,cost1)
static inline void prefetch_l3(const void * uniform ptr) {
    __prefetch_read_uniform_3((uniform int8 * uniform)ptr);
}

__declspec(safe,cost1)
static inline void prefetch_nt(const void * uniform ptr) {
    __prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
}

static inline void prefetch_l1(const void * varying ptr) {
    __pseudo_prefetch_read_varying_1((int64)ptr, (IntMaskType)__mask);
}

static inline void prefetch_l2(const void * varying ptr) {
    __pseudo_prefetch_read_varying_2((int64)ptr, (IntMaskType)__mask);
}

static inline void prefetch_l3(const void * varying ptr) {
    __pseudo_prefetch_read_varying_3((int64)ptr, (IntMaskType)__mask);
}

static inline void prefetch_nt(const void * varying ptr) {
    __pseudo_prefetch_read_varying_nt((int64)ptr, (IntMaskType)__mask);
}

///////////////////////////////////////////////////////////////////////////
// non-short-circuiting alternatives

__declspec(safe,cost1)
static inline bool and(bool a, bool b) {
    return a && b;
}

__declspec(safe,cost1)
static inline uniform bool and(uniform bool a, uniform bool b) {
    return a && b;
}

__declspec(safe,cost1)
static inline bool or(bool a, bool b) {
    return a || b;
}

__declspec(safe,cost1)
static inline uniform bool or(uniform bool a, uniform bool b) {
    return a || b;
}

__declspec(safe,cost1)
static inline int8 select(bool c, int8 a, int8 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int8 select(uniform bool c, int8 a, int8 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline uniform int8 select(uniform bool c, uniform int8 a,
                                  uniform int8 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int16 select(bool c, int16 a, int16 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int16 select(uniform bool c, int16 a, int16 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline uniform int16 select(uniform bool c, uniform int16 a,
                                   uniform int16 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int32 select(bool c, int32 a, int32 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int32 select(uniform bool c, int32 a, int32 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline uniform int32 select(uniform bool c, uniform int32 a,
                                   uniform int32 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int64 select(bool c, int64 a, int64 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline int64 select(uniform bool c, int64 a, int64 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline uniform int64 select(uniform bool c, uniform int64 a,
                                   uniform int64 b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline float select(bool c, float a, float b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline float select(uniform bool c, float a, float b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline uniform float select(uniform bool c, uniform float a,
                                   uniform float b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline double select(bool c, double a, double b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline double select(uniform bool c, double a, double b) {
    return c ? a : b;
}

__declspec(safe,cost1)
static inline uniform double select(uniform bool c, uniform double a,
                                    uniform double b) {
    return c ? a : b;
}
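
// Illustrative sketch (not part of the library): unlike the && and ||
// operators, and()/or() evaluate both operands unconditionally, avoiding
// the mask-management overhead of short-circuiting when the operands are
// cheap and side-effect free; select() likewise evaluates both a and b and
// then chooses per lane.
static inline bool example_in_range(float v, uniform float lo, uniform float hi) {
    // Both comparisons are evaluated; no short-circuit branches are needed.
    return and(v >= lo, v <= hi);
}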

///////////////////////////////////////////////////////////////////////////
// Horizontal ops / reductions

__declspec(safe)
static inline uniform int16 reduce_add(int8 x) {
    return __reduce_add_int8(__mask ? x : (int8)0);
}

__declspec(safe)
static inline uniform unsigned int16 reduce_add(unsigned int8 x) {
    return __reduce_add_int8(__mask ? x : (int8)0);
}

__declspec(safe)
static inline uniform int32 reduce_add(int16 x) {
    return __reduce_add_int16(__mask ? x : (int16)0);
}

__declspec(safe)
static inline uniform unsigned int32 reduce_add(unsigned int16 x) {
    return __reduce_add_int16(__mask ? x : (int16)0);
}

__declspec(safe)
static inline uniform float reduce_add(float x) {
    // zero the lanes where the mask is off
    return __reduce_add_float(__mask ? x : 0.);
}

__declspec(safe)
static inline uniform float reduce_min(float v) {
    // For the lanes where the mask is off, replace the given value with
    // infinity, so that it doesn't affect the result.
    int iflt_max = 0x7f800000; // infinity
    // The unmasked block is needed to make sure that the argument to the
    // unmasked function __reduce_min_float() is computed without a mask.
    bool test = __mask;
    uniform float result;
    unmasked {
        result = __reduce_min_float(test ? v : floatbits(iflt_max));
    }
    return result;
}

__declspec(safe)
static inline uniform float reduce_max(float v) {
    // For the lanes where the mask is off, replace the given value with
    // negative infinity, so that it doesn't affect the result.
    const int iflt_neg_max = 0xff800000; // -infinity
    // The unmasked block is needed to make sure that the argument to the
    // unmasked function __reduce_max_float() is computed without a mask.
    bool test = __mask;
    uniform float result;
    unmasked {
        result = __reduce_max_float(test ? v : floatbits(iflt_neg_max));
    }
    return result;
}

__declspec(safe)
static inline uniform int64 reduce_add(int32 x) {
    // Zero out the values for lanes that aren't running
    return __reduce_add_int32(__mask ? x : 0);
}

__declspec(safe)
static inline uniform int reduce_min(int v) {
    // Set values for non-running lanes to the maximum integer value so
    // they don't affect the result.
    int int_max = 0x7fffffff;
    return __reduce_min_int32(__mask ? v : int_max);
}

__declspec(safe)
static inline uniform int reduce_max(int v) {
    // Set values for non-running lanes to the minimum integer value so
    // they don't affect the result.
    int int_min = 0x80000000;
    return __reduce_max_int32(__mask ? v : int_min);
}

__declspec(safe)
static inline uniform unsigned int64 reduce_add(unsigned int32 x) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_add_int32(__mask ? x : 0);
}

__declspec(safe)
static inline uniform unsigned int reduce_min(unsigned int v) {
    // Set values for non-running lanes to the maximum unsigned integer
    // value so they don't affect the result.
    unsigned int uint_max = 0xffffffff;
    return __reduce_min_uint32(__mask ? v : uint_max);
}

__declspec(safe)
static inline uniform unsigned int reduce_max(unsigned int v) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_max_uint32(__mask ? v : 0);
}

__declspec(safe)
static inline uniform double reduce_add(double x) {
    // zero the lanes where the mask is off
    return __reduce_add_double(__mask ? x : 0.);
}

__declspec(safe)
static inline uniform double reduce_min(double v) {
    int64 iflt_max = 0x7ff0000000000000; // infinity
    // The unmasked block is needed to make sure that the argument to the
    // unmasked function __reduce_min_double() is computed without a mask.
    bool test = __mask;
    uniform double result;
    unmasked {
        result = __reduce_min_double(test ? v : doublebits(iflt_max));
    }
    return result;
}

__declspec(safe)
static inline uniform double reduce_max(double v) {
    const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
    // The unmasked block is needed to make sure that the argument to the
    // unmasked function __reduce_max_double() is computed without a mask.
    bool test = __mask;
    uniform double result;
    unmasked {
        result = __reduce_max_double(test ? v : doublebits(iflt_neg_max));
    }
    return result;
}

__declspec(safe)
static inline uniform int64 reduce_add(int64 x) {
    // Zero out the values for lanes that aren't running
    return __reduce_add_int64(__mask ? x : 0);
}

__declspec(safe)
static inline uniform int64 reduce_min(int64 v) {
    // Set values for non-running lanes to the maximum integer value so
    // they don't affect the result.
    int64 int_max = 0x7fffffffffffffff;
    return __reduce_min_int64(__mask ? v : int_max);
}

__declspec(safe)
static inline uniform int64 reduce_max(int64 v) {
    // Set values for non-running lanes to the minimum integer value so
    // they don't affect the result.
    int64 int_min = 0x8000000000000000;
    return __reduce_max_int64(__mask ? v : int_min);
}

__declspec(safe)
static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_add_int64(__mask ? x : 0);
}

__declspec(safe)
static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
    // Set values for non-running lanes to the maximum unsigned integer
    // value so they don't affect the result.
    unsigned int64 uint_max = 0xffffffffffffffff;
    return __reduce_min_uint64(__mask ? v : uint_max);
}

__declspec(safe)
static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_max_uint64(__mask ? v : 0);
}
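
// Illustrative sketch (not part of the library): the reductions honor the
// execution mask, so they compose naturally with foreach loops and varying
// control flow.
static inline uniform float example_array_sum(uniform float a[], uniform int n) {
    float partial = 0.;
    foreach (i = 0 ... n)
        partial += a[i];            // per-lane partial sums
    return reduce_add(partial);     // collapse the lanes into one value
}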

#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE) \
__declspec(safe) \
static inline uniform bool reduce_equal(TYPE v) { \
    uniform TYPE unusedValue; \
    return __reduce_equal_##FUNCTYPE(v, &unusedValue, (MASKTYPE)__mask); \
} \
__declspec(safe) \
static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { \
    return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
}

REDUCE_EQUAL(int32, int32, IntMaskType)
REDUCE_EQUAL(unsigned int32, int32, UIntMaskType)
REDUCE_EQUAL(float, float, IntMaskType)
REDUCE_EQUAL(int64, int64, IntMaskType)
REDUCE_EQUAL(unsigned int64, int64, UIntMaskType)
REDUCE_EQUAL(double, double, IntMaskType)

static int32 exclusive_scan_add(int32 v) {
    return __exclusive_scan_add_i32(v, (IntMaskType)__mask);
}

static unsigned int32 exclusive_scan_add(unsigned int32 v) {
    return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask);
}

static float exclusive_scan_add(float v) {
    return __exclusive_scan_add_float(v, __mask);
}

static int64 exclusive_scan_add(int64 v) {
    return __exclusive_scan_add_i64(v, (IntMaskType)__mask);
}

static unsigned int64 exclusive_scan_add(unsigned int64 v) {
    return __exclusive_scan_add_i64(v, (UIntMaskType)__mask);
}

static double exclusive_scan_add(double v) {
    return __exclusive_scan_add_double(v, __mask);
}

static int32 exclusive_scan_and(int32 v) {
    return __exclusive_scan_and_i32(v, (IntMaskType)__mask);
}

static unsigned int32 exclusive_scan_and(unsigned int32 v) {
    return __exclusive_scan_and_i32(v, (UIntMaskType)__mask);
}

static int64 exclusive_scan_and(int64 v) {
    return __exclusive_scan_and_i64(v, (IntMaskType)__mask);
}

static unsigned int64 exclusive_scan_and(unsigned int64 v) {
    return __exclusive_scan_and_i64(v, (UIntMaskType)__mask);
}

static int32 exclusive_scan_or(int32 v) {
    return __exclusive_scan_or_i32(v, (IntMaskType)__mask);
}

static unsigned int32 exclusive_scan_or(unsigned int32 v) {
    return __exclusive_scan_or_i32(v, (UIntMaskType)__mask);
}

static int64 exclusive_scan_or(int64 v) {
    return __exclusive_scan_or_i64(v, (IntMaskType)__mask);
}

static unsigned int64 exclusive_scan_or(unsigned int64 v) {
    return __exclusive_scan_or_i64(v, (UIntMaskType)__mask);
}
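
// Illustrative note (an assumption about usage; not part of the library):
// exclusive_scan_add() gives each active lane the sum of the preceding
// active lanes' values, with the first active lane receiving 0. With all
// lanes on and v = {1, 1, 1, ...}, the result is {0, 1, 2, ...}, which is
// the standard building block for computing per-lane output offsets.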

///////////////////////////////////////////////////////////////////////////
// packed load, store

static inline uniform int
packed_load_active(uniform unsigned int a[],
                   varying unsigned int * uniform vals) {
    return __packed_load_active(a, vals, (UIntMaskType)__mask);
}

static inline uniform int
packed_store_active(uniform unsigned int a[],
                    unsigned int vals) {
    return __packed_store_active(a, vals, (UIntMaskType)__mask);
}

static inline uniform int
packed_store_active2(uniform unsigned int a[],
                     unsigned int vals) {
    return __packed_store_active2(a, vals, (UIntMaskType)__mask);
}

static inline uniform int
packed_load_active(uniform int a[], varying int * uniform vals) {
    return __packed_load_active(a, vals, (IntMaskType)__mask);
}

static inline uniform int
packed_store_active(uniform int a[], int vals) {
    return __packed_store_active(a, vals, (IntMaskType)__mask);
}

static inline uniform int
packed_store_active(bool active, uniform int a[], int vals) {
    return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
}

static inline uniform int
packed_store_active2(uniform int a[], int vals) {
    return __packed_store_active2(a, vals, (IntMaskType)__mask);
}
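
// Illustrative sketch (not part of the library): packed_store_active()
// performs stream compaction, writing the active lanes' values
// contiguously and returning how many values were written.
static inline uniform int example_keep_negative(uniform int src[], uniform int n,
                                                uniform int dst[]) {
    uniform int count = 0;
    foreach (i = 0 ... n) {
        int v = src[i];
        if (v < 0)
            count += packed_store_active(dst + count, v);
    }
    return count;
}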

///////////////////////////////////////////////////////////////////////////
// System information

static inline uniform int num_cores() {
    if (__is_nvptx_target)
        return 15*32; // K20/K20X/K40 - 15 SMX x 32 warps/smx (max is 64 warps/smx)
    else
        return __num_cores();
}

__declspec(safe)
static inline uniform int64 clock() {
    return __clock();
}

///////////////////////////////////////////////////////////////////////////
// Floating-Point Math

__declspec(safe,cost1)
static inline uniform bool isnan(uniform float v) {
    return v != v;
}

__declspec(safe,cost1)
static inline bool isnan(float v) {
    return v != v;
}

__declspec(safe,cost1)
static inline uniform bool isnan(uniform double v) {
    return v != v;
}

__declspec(safe,cost1)
static inline bool isnan(double v) {
    return v != v;
}

__declspec(safe,cost1)
static inline float abs(float a) {
    // Floating-point hack: zeroing the high bit clears the sign
    unsigned int i = intbits(a);
    i &= 0x7fffffff;
    return floatbits(i);
}

__declspec(safe,cost1)
static inline uniform float abs(uniform float a) {
    uniform unsigned int i = intbits(a);
    i &= 0x7fffffff;
    return floatbits(i);
}

__declspec(safe,cost1)
static inline double abs(double a) {
    // zeroing the high bit clears the sign
    unsigned int64 i = intbits(a);
    i &= 0x7fffffffffffffff;
    return doublebits(i);
}

__declspec(safe,cost1)
static inline uniform double abs(uniform double a) {
    uniform unsigned int64 i = intbits(a);
    i &= 0x7fffffffffffffff;
    return doublebits(i);
}

__declspec(safe,cost1)
static inline unsigned int signbits(float x) {
    unsigned int i = intbits(x);
    return (i & 0x80000000);
}

__declspec(safe,cost1)
static inline uniform unsigned int signbits(uniform float x) {
    uniform unsigned int i = intbits(x);
    return (i & 0x80000000);
}

__declspec(safe,cost1)
static inline unsigned int64 signbits(double x) {
    unsigned int64 i = intbits(x);
    return (i & 0x8000000000000000);
}

__declspec(safe,cost1)
static inline uniform unsigned int64 signbits(uniform double x) {
    uniform unsigned int64 i = intbits(x);
    return (i & 0x8000000000000000);
}

__declspec(safe,cost2)
static inline float round(float x) {
    return __round_varying_float(x);
}

__declspec(safe,cost2)
static inline uniform float round(uniform float x) {
    return __round_uniform_float(x);
}

__declspec(safe,cost2)
static inline double round(double x) {
    return __round_varying_double(x);
}

__declspec(safe,cost2)
static inline uniform double round(uniform double x) {
    return __round_uniform_double(x);
}

__declspec(safe,cost2)
static inline float floor(float x) {
    return __floor_varying_float(x);
}

__declspec(safe,cost2)
static inline uniform float floor(uniform float x) {
    return __floor_uniform_float(x);
}

__declspec(safe,cost2)
static inline double floor(double x) {
    return __floor_varying_double(x);
}

__declspec(safe,cost2)
static inline uniform double floor(uniform double x) {
    return __floor_uniform_double(x);
}

__declspec(safe,cost2)
static inline float ceil(float x) {
    return __ceil_varying_float(x);
}

__declspec(safe,cost2)
static inline uniform float ceil(uniform float x) {
    return __ceil_uniform_float(x);
}

__declspec(safe,cost2)
static inline double ceil(double x) {
    return __ceil_varying_double(x);
}

__declspec(safe,cost2)
static inline uniform double ceil(uniform double x) {
    return __ceil_uniform_double(x);
}

__declspec(safe)
static inline float rcp(float v) {
    return __rcp_varying_float(v);
}

__declspec(safe)
static inline uniform float rcp(uniform float v) {
    return __rcp_uniform_float(v);
}

#define RCPD(QUAL) \
__declspec(safe) \
static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \
{ \
    iv = iv * (2.0d - v*iv); \
    iv = iv * (2.0d - v*iv); \
    return iv; \
} \
__declspec(safe) \
static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \
{ \
    if (x <= 1.0d+33 && x >= 1.0d-33) \
        return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \
    QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
    QUAL double exp = doublebits(0x7fd0000000000000 + ~ex); \
    QUAL double y = rcp((QUAL float)(x*exp)); \
    return __rcp_iterate_##QUAL##_double(x, y*exp); \
}

RCPD(varying)
__declspec(safe)
static inline double rcp(double v) {
    if (__have_native_rcpd)
        return __rcp_varying_double(v);
    else
        return __rcp_safe_varying_double(v);
}

RCPD(uniform)
__declspec(safe)
static inline uniform double rcp(uniform double v) {
    if (__have_native_rcpd)
        return __rcp_uniform_double(v);
    else
        return __rcp_safe_uniform_double(v);
}
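
// Illustrative note on the math above (not part of the library): the RCPD
// helpers refine a single-precision reciprocal estimate with two
// Newton-Raphson steps. For f(y) = 1/y - v, Newton's update is
//
//     y' = y * (2 - v * y)
//
// which roughly doubles the number of correct bits per step, enough to
// carry a float-precision estimate to double precision. The scaling path
// handles inputs whose exponents would overflow or underflow the
// intermediate float estimate.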

///////////////////////////////////////////////////////////////////////////
// min/max

// float
__declspec(safe,cost1)
static inline float min(float a, float b) {
    return __min_varying_float(a, b);
}

__declspec(safe,cost1)
static inline uniform float min(uniform float a, uniform float b) {
    return __min_uniform_float(a, b);
}

__declspec(safe,cost1)
static inline float max(float a, float b) {
    return __max_varying_float(a, b);
}

__declspec(safe,cost1)
static inline uniform float max(uniform float a, uniform float b) {
    return __max_uniform_float(a, b);
}

// double
__declspec(safe)
static inline double min(double a, double b) {
    return __min_varying_double(a, b);
}

__declspec(safe)
static inline uniform double min(uniform double a, uniform double b) {
    return __min_uniform_double(a, b);
}

__declspec(safe)
static inline double max(double a, double b) {
    return __max_varying_double(a, b);
}

__declspec(safe)
static inline uniform double max(uniform double a, uniform double b) {
    return __max_uniform_double(a, b);
}

// int8
__declspec(safe,cost1)
static inline uniform unsigned int8 min(uniform unsigned int8 a,
                                        uniform unsigned int8 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline uniform unsigned int8 max(uniform unsigned int8 a,
                                        uniform unsigned int8 b) {
    return (a > b) ? a : b;
}

__declspec(safe,cost1)
static inline uniform int8 min(uniform int8 a, uniform int8 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline uniform int8 max(uniform int8 a, uniform int8 b) {
    return (a > b) ? a : b;
}

__declspec(safe,cost1)
static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) {
    return (a > b) ? a : b;
}

__declspec(safe,cost1)
static inline int8 min(int8 a, int8 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline int8 max(int8 a, int8 b) {
    return (a > b) ? a : b;
}

// int16
__declspec(safe,cost1)
static inline uniform unsigned int16 min(uniform unsigned int16 a,
                                         uniform unsigned int16 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline uniform unsigned int16 max(uniform unsigned int16 a,
                                         uniform unsigned int16 b) {
    return (a > b) ? a : b;
}

__declspec(safe,cost1)
static inline uniform int16 min(uniform int16 a, uniform int16 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline uniform int16 max(uniform int16 a, uniform int16 b) {
    return (a > b) ? a : b;
}

__declspec(safe,cost1)
static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) {
    return (a > b) ? a : b;
}

__declspec(safe,cost1)
static inline int16 min(int16 a, int16 b) {
    return (a < b) ? a : b;
}

__declspec(safe,cost1)
static inline int16 max(int16 a, int16 b) {
    return (a > b) ? a : b;
}

// int32
__declspec(safe,cost1)
static inline unsigned int min(unsigned int a, unsigned int b) {
    return __min_varying_uint32(a, b);
}

__declspec(safe,cost1)
static inline uniform unsigned int min(uniform unsigned int a, uniform unsigned int b) {
    return __min_uniform_uint32(a, b);
}

__declspec(safe,cost1)
static inline unsigned int max(unsigned int a, unsigned int b) {
    return __max_varying_uint32(a, b);
}

__declspec(safe,cost1)
static inline uniform unsigned int max(uniform unsigned int a, uniform unsigned int b) {
    return __max_uniform_uint32(a, b);
}

__declspec(safe,cost1)
static inline int min(int a, int b) {
    return __min_varying_int32(a, b);
}

__declspec(safe,cost1)
static inline uniform int min(uniform int a, uniform int b) {
    return __min_uniform_int32(a, b);
}

__declspec(safe,cost1)
static inline int max(int a, int b) {
    return __max_varying_int32(a, b);
}

__declspec(safe,cost1)
static inline uniform int max(uniform int a, uniform int b) {
    return __max_uniform_int32(a, b);
}

// int64
__declspec(safe,cost1)
static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
    return __min_varying_uint64(a, b);
}

__declspec(safe,cost1)
static inline uniform unsigned int64 min(uniform unsigned int64 a, uniform unsigned int64 b) {
    return __min_uniform_uint64(a, b);
}

__declspec(safe,cost1)
static inline unsigned int64 max(unsigned int64 a, unsigned int64 b) {
    return __max_varying_uint64(a, b);
}

__declspec(safe,cost1)
static inline uniform unsigned int64 max(uniform unsigned int64 a, uniform unsigned int64 b) {
    return __max_uniform_uint64(a, b);
}

__declspec(safe,cost1)
static inline int64 min(int64 a, int64 b) {
    return __min_varying_int64(a, b);
}

__declspec(safe,cost1)
static inline uniform int64 min(uniform int64 a, uniform int64 b) {
    return __min_uniform_int64(a, b);
}

__declspec(safe,cost1)
static inline int64 max(int64 a, int64 b) {
    return __max_varying_int64(a, b);
}

__declspec(safe,cost1)
static inline uniform int64 max(uniform int64 a, uniform int64 b) {
    return __max_uniform_int64(a, b);
}

///////////////////////////////////////////////////////////////////////////
// clamps

// float
__declspec(safe,cost2)
static inline float clamp(float v, float low, float high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform float clamp(uniform float v, uniform float low, uniform float high) {
    return min(max(v, low), high);
}

// double
__declspec(safe,cost2)
static inline double clamp(double v, double low, double high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform double clamp(uniform double v, uniform double low, uniform double high) {
    return min(max(v, low), high);
}

// int8
__declspec(safe,cost2)
static inline unsigned int8 clamp(unsigned int8 v, unsigned int8 low,
                                  unsigned int8 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform unsigned int8 clamp(uniform unsigned int8 v,
                                          uniform unsigned int8 low,
                                          uniform unsigned int8 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline int8 clamp(int8 v, int8 low, int8 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform int8 clamp(uniform int8 v, uniform int8 low,
                                 uniform int8 high) {
    return min(max(v, low), high);
}

// int16
__declspec(safe,cost2)
static inline unsigned int16 clamp(unsigned int16 v, unsigned int16 low,
                                   unsigned int16 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform unsigned int16 clamp(uniform unsigned int16 v,
                                           uniform unsigned int16 low,
                                           uniform unsigned int16 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline int16 clamp(int16 v, int16 low, int16 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform int16 clamp(uniform int16 v, uniform int16 low,
                                  uniform int16 high) {
    return min(max(v, low), high);
}

// int32
__declspec(safe,cost2)
static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigned int low,
                                         uniform unsigned int high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline int clamp(int v, int low, int high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform int clamp(uniform int v, uniform int low, uniform int high) {
    return min(max(v, low), high);
}

// int64
__declspec(safe,cost2)
static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low,
                                   unsigned int64 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform unsigned int64 clamp(uniform unsigned int64 v,
                                           uniform unsigned int64 low,
                                           uniform unsigned int64 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline int64 clamp(int64 v, int64 low, int64 high) {
    return min(max(v, low), high);
}

__declspec(safe,cost2)
static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
                                  uniform int64 high) {
    return min(max(v, low), high);
}

///////////////////////////////////////////////////////////////////////////
// Global atomics and memory barriers

static inline void memory_barrier() {
    __memory_barrier();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    if (__is_nvptx_target) { \
        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
        return ret; \
    } else { \
        uniform TA * uniform ptrArray[programCount]; \
        ptrArray[programIndex] = ptr; \
        TA ret; \
        foreach_active (i) { \
            uniform TA * uniform p = ptrArray[i]; \
            uniform TA v = extract(value, i); \
            uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
            ret = insert(ret, i, r); \
        } \
        return ret; \
    } \
}

#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
    if (__is_nvptx_target) { \
        TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
        return ret; \
    } else { \
        uniform int i = 0; \
        TA ret[programCount]; \
        TA memVal; \
        uniform int lastSwap; \
        uniform unsigned int64 mask = lanemask(); \
        /* First, have the first running program instance (if any) perform \
           the swap with memory with its value of "value"; record the \
           value returned. */ \
        for (; i < programCount; ++i) { \
            if ((mask & (1ull << i)) == 0) \
                continue; \
            memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
            lastSwap = i; \
            break; \
        } \
        /* Now, for all of the remaining running program instances, set the \
           return value of the last instance that did a swap with this \
           instance's value of "value"; this gives the same effect as if the \
           current instance had executed a hardware atomic swap right before \
           the last one that did a swap. */ \
        for (; i < programCount; ++i) { \
            if ((mask & (1ull << i)) == 0) \
                continue; \
            ret[lastSwap] = extract(value, i); \
            lastSwap = i; \
        } \
        /* And the last instance that wanted to swap gets the value we \
           originally got back from memory... */ \
        ret[lastSwap] = memVal; \
        return ret[programIndex]; \
    } \
} \
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
                                            uniform TA value) { \
    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
    return ret; \
} \
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
    if (__is_nvptx_target) { \
        TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
        return ret; \
    } else { \
        uniform TA * uniform ptrArray[programCount]; \
        ptrArray[programIndex] = ptr; \
        TA ret; \
        foreach_active (i) { \
            uniform TA * uniform p = ptrArray[i]; \
            uniform TA v = extract(value, i); \
            uniform TA r = __atomic_swap_uniform_##TB##_global(p, v); \
            ret = insert(ret, i, r); \
        } \
        return ret; \
    } \
}

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
    if (lanemask() != 0) \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
    if (__is_nvptx_target) { \
        TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
        return ret; \
    } else { \
        uniform TA * uniform ptrArray[programCount]; \
        ptrArray[programIndex] = ptr; \
        TA ret; \
        foreach_active (i) { \
            uniform TA * uniform p = ptrArray[i]; \
            uniform TA v = extract(value, i); \
            uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v); \
            ret = insert(ret, i, r); \
        } \
        return ret; \
    } \
}

DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)

DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)

DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)

DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)

#undef DEFINE_ATOMIC_OP
#undef DEFINE_ATOMIC_MINMAX_OP
#undef DEFINE_ATOMIC_SWAP
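
// Illustrative sketch (not part of the library): the varying forms issue
// one atomic per active lane. A common pattern is reserving slots in a
// shared output buffer:
static inline void example_append(uniform int * uniform counter,
                                  uniform float buf[], float value) {
    // Each active lane atomically reserves its own slot index.
    int slot = atomic_add_global(counter, 1);
    buf[slot] = value;   // scatter into the reserved slots
}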
- #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \
- static inline uniform TA atomic_compare_exchange_global( \
- uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
- uniform TA ret = \
- __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
- return ret; \
- } \
- static inline TA atomic_compare_exchange_global( \
- uniform TA * uniform ptr, TA oldval, TA newval) { \
- TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
- (MASKTYPE)__mask); \
- return ret; \
- } \
- static inline TA atomic_compare_exchange_global( \
- uniform TA * varying ptr, TA oldval, TA newval) { \
- if (__is_nvptx_target) { \
- TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \
- return ret; \
- } else { \
- uniform TA * uniform ptrArray[programCount]; \
- ptrArray[programIndex] = ptr; \
- TA ret; \
- foreach_active (i) { \
- uniform TA r = \
- __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
- extract(oldval, i), \
- extract(newval, i)); \
- ret = insert(ret, i, r); \
- } \
- return ret; \
- } \
- }
- ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
- ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
- ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
- ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
- ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
- ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)
- #undef ATOMIC_DECL_CMPXCHG
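- // Illustrative sketch (not part of the library): a CAS loop built on the
- // atomic_compare_exchange_global() overloads above, e.g. to get a float
- // "fetch-add" where only compare-exchange is available. The function name
- // is hypothetical.
- static inline uniform float __example_atomic_add_cas(uniform float * uniform ptr,
-                                                      uniform float delta) {
-     uniform float old = *ptr;
-     while (true) {
-         uniform float prev = atomic_compare_exchange_global(ptr, old, old + delta);
-         if (prev == old)   // exchange succeeded; prev is the value before the add
-             return prev;
-         old = prev;        // contention: retry with the freshly observed value
-     }
- }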
- // void * variants of swap and compare exchange
- static inline void *atomic_swap_global(void ** uniform ptr,
- void * value) {
- return (void *)atomic_swap_global((intptr_t * uniform)ptr,
- (intptr_t)value);
- }
- static inline void * uniform atomic_swap_global(void ** uniform ptr,
- void * uniform value) {
- return (void * uniform)atomic_swap_global((intptr_t * uniform)ptr,
- (uniform intptr_t)value);
- }
- static inline void *atomic_swap_global(void ** ptr, void * value) {
- return (void *)atomic_swap_global((intptr_t *)ptr,
- (intptr_t)value);
- }
- static inline void *
- atomic_compare_exchange_global(void ** uniform ptr,
- void * oldval, void * newval) {
- return (void *)atomic_compare_exchange_global((intptr_t * uniform)ptr,
- (intptr_t)oldval,
- (intptr_t)newval);
- }
- static inline void * uniform
- atomic_compare_exchange_global(void ** uniform ptr, void * uniform oldval,
- void * uniform newval) {
- return (void * uniform)atomic_compare_exchange_global((intptr_t * uniform)ptr,
- (uniform intptr_t)oldval,
- (uniform intptr_t)newval);
- }
- static inline void *
- atomic_compare_exchange_global(void ** ptr, void * oldval,
- void * newval) {
- return (void *)atomic_compare_exchange_global((intptr_t *)ptr,
- (intptr_t)oldval,
- (intptr_t)newval);
- }
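- // Note (illustrative): the void * overloads above just reinterpret the
- // pointer value as an intptr_t and reuse the integer atomics, since a
- // pointer swap or compare-exchange is simply an integer-width atomic on
- // the pointer bits.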
- ///////////////////////////////////////////////////////////////////////////
- // local atomics
- #define LOCAL_ATOMIC(TYPE,NAME,OPFUNC) \
- static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
- uniform TYPE value) { \
- uniform TYPE ret = *ptr; \
- *ptr = OPFUNC(*ptr, value); \
- return ret; \
- } \
- static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
- TYPE ret; \
- foreach_active (i) { \
- ret = insert(ret, i, *ptr); \
- *ptr = OPFUNC(*ptr, extract(value, i)); \
- } \
- return ret; \
- } \
- static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
- TYPE ret; \
- if (__is_nvptx_target) { \
- foreach_active (i) { \
- uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
- ret = insert(ret, i, *ptr); \
- *ptr = OPFUNC(*ptr, extract(value, i)); \
- } \
- } else { \
- uniform TYPE * uniform ptrs[programCount]; \
- ptrs[programIndex] = p; \
- foreach_active (i) { \
- ret = insert(ret, i, *ptrs[i]); \
- *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
- } \
- } \
- return ret; \
- }
- static inline uniform int32 __add(uniform int32 a, uniform int32 b) { return a+b; }
- static inline uniform int32 __sub(uniform int32 a, uniform int32 b) { return a-b; }
- static inline uniform int32 __and(uniform int32 a, uniform int32 b) { return a & b; }
- static inline uniform int32 __or(uniform int32 a, uniform int32 b) { return a | b; }
- static inline uniform int32 __xor(uniform int32 a, uniform int32 b) { return a ^ b; }
- static inline uniform int32 __swap(uniform int32 a, uniform int32 b) { return b; }
- static inline uniform unsigned int32 __add(uniform unsigned int32 a,
- uniform unsigned int32 b) { return a+b; }
- static inline uniform unsigned int32 __sub(uniform unsigned int32 a,
- uniform unsigned int32 b) { return a-b; }
- static inline uniform unsigned int32 __and(uniform unsigned int32 a,
- uniform unsigned int32 b) { return a & b; }
- static inline uniform unsigned int32 __or(uniform unsigned int32 a,
- uniform unsigned int32 b) { return a | b; }
- static inline uniform unsigned int32 __xor(uniform unsigned int32 a,
- uniform unsigned int32 b) { return a ^ b; }
- static inline uniform unsigned int32 __swap(uniform unsigned int32 a,
- uniform unsigned int32 b) { return b; }
- static inline uniform float __add(uniform float a, uniform float b) { return a+b; }
- static inline uniform float __sub(uniform float a, uniform float b) { return a-b; }
- static inline uniform float __swap(uniform float a, uniform float b) { return b; }
- static inline uniform int64 __add(uniform int64 a, uniform int64 b) { return a+b; }
- static inline uniform int64 __sub(uniform int64 a, uniform int64 b) { return a-b; }
- static inline uniform int64 __and(uniform int64 a, uniform int64 b) { return a & b; }
- static inline uniform int64 __or(uniform int64 a, uniform int64 b) { return a | b; }
- static inline uniform int64 __xor(uniform int64 a, uniform int64 b) { return a ^ b; }
- static inline uniform int64 __swap(uniform int64 a, uniform int64 b) { return b; }
- static inline uniform unsigned int64 __add(uniform unsigned int64 a,
- uniform unsigned int64 b) { return a+b; }
- static inline uniform unsigned int64 __sub(uniform unsigned int64 a,
- uniform unsigned int64 b) { return a-b; }
- static inline uniform unsigned int64 __and(uniform unsigned int64 a,
- uniform unsigned int64 b) { return a & b; }
- static inline uniform unsigned int64 __or(uniform unsigned int64 a,
- uniform unsigned int64 b) { return a | b; }
- static inline uniform unsigned int64 __xor(uniform unsigned int64 a,
- uniform unsigned int64 b) { return a ^ b; }
- static inline uniform unsigned int64 __swap(uniform unsigned int64 a,
- uniform unsigned int64 b) { return b; }
- static inline uniform double __add(uniform double a, uniform double b) { return a+b; }
- static inline uniform double __sub(uniform double a, uniform double b) { return a-b; }
- static inline uniform double __swap(uniform double a, uniform double b) { return b; }
- LOCAL_ATOMIC(int32, add, __add)
- LOCAL_ATOMIC(int32, subtract, __sub)
- LOCAL_ATOMIC(int32, and, __and)
- LOCAL_ATOMIC(int32, or, __or)
- LOCAL_ATOMIC(int32, xor, __xor)
- LOCAL_ATOMIC(int32, min, min)
- LOCAL_ATOMIC(int32, max, max)
- LOCAL_ATOMIC(int32, swap, __swap)
- LOCAL_ATOMIC(unsigned int32, add, __add)
- LOCAL_ATOMIC(unsigned int32, subtract, __sub)
- LOCAL_ATOMIC(unsigned int32, and, __and)
- LOCAL_ATOMIC(unsigned int32, or, __or)
- LOCAL_ATOMIC(unsigned int32, xor, __xor)
- LOCAL_ATOMIC(unsigned int32, min, min)
- LOCAL_ATOMIC(unsigned int32, max, max)
- LOCAL_ATOMIC(unsigned int32, swap, __swap)
- LOCAL_ATOMIC(float, add, __add)
- LOCAL_ATOMIC(float, subtract, __sub)
- LOCAL_ATOMIC(float, min, min)
- LOCAL_ATOMIC(float, max, max)
- LOCAL_ATOMIC(float, swap, __swap)
- LOCAL_ATOMIC(int64, add, __add)
- LOCAL_ATOMIC(int64, subtract, __sub)
- LOCAL_ATOMIC(int64, and, __and)
- LOCAL_ATOMIC(int64, or, __or)
- LOCAL_ATOMIC(int64, xor, __xor)
- LOCAL_ATOMIC(int64, min, min)
- LOCAL_ATOMIC(int64, max, max)
- LOCAL_ATOMIC(int64, swap, __swap)
- LOCAL_ATOMIC(unsigned int64, add, __add)
- LOCAL_ATOMIC(unsigned int64, subtract, __sub)
- LOCAL_ATOMIC(unsigned int64, and, __and)
- LOCAL_ATOMIC(unsigned int64, or, __or)
- LOCAL_ATOMIC(unsigned int64, xor, __xor)
- LOCAL_ATOMIC(unsigned int64, min, min)
- LOCAL_ATOMIC(unsigned int64, max, max)
- LOCAL_ATOMIC(unsigned int64, swap, __swap)
- LOCAL_ATOMIC(double, add, __add)
- LOCAL_ATOMIC(double, subtract, __sub)
- LOCAL_ATOMIC(double, min, min)
- LOCAL_ATOMIC(double, max, max)
- LOCAL_ATOMIC(double, swap, __swap)
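- // Illustrative sketch (not part of the library): the "local" atomics above
- // serialize per-lane updates with foreach_active rather than issuing
- // hardware atomics. For example, accumulating each lane's programIndex
- // into a single uniform counter (function name is hypothetical):
- static inline uniform int32 __example_local_sum(uniform int32 * uniform counter) {
-     atomic_add_local(counter, programIndex);   // one serialized add per active lane
-     return *counter;
- }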
- // compare exchange
- #define LOCAL_CMPXCHG(TYPE) \
- static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
- uniform TYPE cmp, \
- uniform TYPE update) { \
- uniform TYPE old = *ptr; \
- if (old == cmp) \
- *ptr = update; \
- return old; \
- } \
- static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
- TYPE cmp, TYPE update) { \
- TYPE ret; \
- foreach_active (i) { \
- uniform TYPE old = *ptr; \
- if (old == extract(cmp, i)) \
- *ptr = extract(update, i); \
- ret = insert(ret, i, old); \
- } \
- return ret; \
- } \
- static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
- TYPE cmp, TYPE update) { \
- uniform TYPE * uniform ptrs[programCount]; \
- ptrs[programIndex] = p; \
- TYPE ret; \
- foreach_active (i) { \
- uniform TYPE old = *ptrs[i]; \
- if (old == extract(cmp, i)) \
- *ptrs[i] = extract(update, i); \
- ret = insert(ret, i, old); \
- } \
- return ret; \
- }
- LOCAL_CMPXCHG(int32)
- LOCAL_CMPXCHG(unsigned int32)
- LOCAL_CMPXCHG(float)
- LOCAL_CMPXCHG(int64)
- LOCAL_CMPXCHG(unsigned int64)
- LOCAL_CMPXCHG(double)
- #undef LOCAL_ATOMIC
- #undef LOCAL_CMPXCHG
- // void * variants of swap and compare exchange
- static inline void *atomic_swap_local(void ** uniform ptr,
- void * value) {
- return (void *)atomic_swap_local((intptr_t * uniform)ptr,
- (intptr_t)value);
- }
- static inline void * uniform atomic_swap_local(void ** uniform ptr,
- void * uniform value) {
- return (void * uniform)atomic_swap_local((intptr_t * uniform)ptr,
- (uniform intptr_t)value);
- }
- static inline void *atomic_swap_local(void ** ptr, void * value) {
- return (void *)atomic_swap_local((intptr_t *)ptr,
- (intptr_t)value);
- }
- static inline void *
- atomic_compare_exchange_local(void ** uniform ptr,
- void * oldval, void * newval) {
- return (void *)atomic_compare_exchange_local((intptr_t * uniform)ptr,
- (intptr_t)oldval,
- (intptr_t)newval);
- }
- static inline void * uniform
- atomic_compare_exchange_local(void ** uniform ptr, void * uniform oldval,
- void * uniform newval) {
- return (void * uniform)atomic_compare_exchange_local((intptr_t * uniform)ptr,
- (uniform intptr_t)oldval,
- (uniform intptr_t)newval);
- }
- static inline void *
- atomic_compare_exchange_local(void ** ptr, void * oldval,
- void * newval) {
- return (void *)atomic_compare_exchange_local((intptr_t *)ptr,
- (intptr_t)oldval,
- (intptr_t)newval);
- }
- ///////////////////////////////////////////////////////////////////////////
- // Transcendentals (float precision)
- __declspec(safe)
- static inline float sqrt(float v) {
- return __sqrt_varying_float(v);
- }
- __declspec(safe)
- static inline uniform float sqrt(uniform float v) {
- return __sqrt_uniform_float(v);
- }
- __declspec(safe)
- static inline float rsqrt(float v) {
- return __rsqrt_varying_float(v);
- }
- __declspec(safe)
- static inline uniform float rsqrt(uniform float v) {
- return __rsqrt_uniform_float(v);
- }
- __declspec(safe)
- static inline float ldexp(float x, int n) {
- unsigned int ex = 0x7F800000u;
- unsigned int ix = intbits(x);
- ex &= ix; // extract old exponent
- ix = ix & ~0x7F800000u; // clear exponent
- n = (n << 23) + ex;
- ix |= n; // insert new exponent
- return floatbits(ix);
- }
- __declspec(safe)
- static inline uniform float ldexp(uniform float x, uniform int n) {
- uniform unsigned int ex = 0x7F800000u;
- uniform unsigned int ix = intbits(x);
- ex &= ix; // extract old exponent
- ix = ix & ~0x7F800000u; // clear exponent
- n = (n << 23) + ex;
- ix |= n; // insert new exponent
- return floatbits(ix);
- }
- __declspec(safe)
- static inline float frexp(float x, varying int * uniform pw2) {
- unsigned int ex = 0x7F800000u; // exponent mask
- unsigned int ix = intbits(x);
- ex &= ix;
- ix &= ~0x7F800000u; // clear exponent
- *pw2 = (int)(ex >> 23) - 126; // compute exponent
- ix |= 0x3F000000u; // insert exponent +1 in x
- return floatbits(ix);
- }
- __declspec(safe)
- static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
- uniform unsigned int ex = 0x7F800000u; // exponent mask
- uniform unsigned int ix = intbits(x);
- ex &= ix;
- ix &= ~0x7F800000u; // clear exponent
- *pw2 = (uniform int)(ex >> 23) - 126; // compute exponent
- ix |= 0x3F000000u; // insert exponent +1 in x
- return floatbits(ix);
- }
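- // Illustrative sketch (not part of the library): frexp() returns m in
- // [0.5, 1) with x == m * 2^e, so ldexp(m, e) reconstructs x for normal,
- // finite inputs. The function name below is hypothetical.
- static inline float __example_frexp_roundtrip(float x) {
-     int e;
-     float m = frexp(x, &e);
-     return ldexp(m, e);   // == x for normal inputs
- }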
- // Most of the transcendental implementations in ispc code here come from
- // Solomon Boulos's "syrah": https://github.com/boulos/syrah/
- __declspec(safe)
- static inline float sin(float x_full) {
- if (__have_native_trigonometry)
- {
- return __sin_varying_float(x_full);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_sinf(x_full);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_sinf(extract(x_full, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- static const float pi_over_two_vec = 1.57079637050628662109375;
- static const float two_over_pi_vec = 0.636619746685028076171875;
- float scaled = x_full * two_over_pi_vec;
- float k_real = floor(scaled);
- int k = (int)k_real;
- // Reduced range version of x
- float x = x_full - k_real * pi_over_two_vec;
- int k_mod4 = k & 3;
- bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
- bool flip_sign = (k_mod4 > 1);
- // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
- // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
- static const float sin_c2 = -0.16666667163372039794921875;
- static const float sin_c4 = 8.333347737789154052734375e-3;
- static const float sin_c6 = -1.9842604524455964565277099609375e-4;
- static const float sin_c8 = 2.760012648650445044040679931640625e-6;
- static const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
- static const float cos_c2 = -0.5;
- static const float cos_c4 = 4.166664183139801025390625e-2;
- static const float cos_c6 = -1.388833043165504932403564453125e-3;
- static const float cos_c8 = 2.47562347794882953166961669921875e-5;
- static const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
- float outside = sin_usecos ? 1 : x;
- float c2 = sin_usecos ? cos_c2 : sin_c2;
- float c4 = sin_usecos ? cos_c4 : sin_c4;
- float c6 = sin_usecos ? cos_c6 : sin_c6;
- float c8 = sin_usecos ? cos_c8 : sin_c8;
- float c10 = sin_usecos ? cos_c10 : sin_c10;
- float x2 = x * x;
- float formula = x2 * c10 + c8;
- formula = x2 * formula + c6;
- formula = x2 * formula + c4;
- formula = x2 * formula + c2;
- formula = x2 * formula + 1;
- formula *= outside;
- formula = flip_sign ? -formula : formula;
- return formula;
- }
- }
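- // Note (illustrative): with x_full = x + k*pi/2 after range reduction,
- //   k mod 4 == 0  ->  sin(x_full) =  sin(x)
- //   k mod 4 == 1  ->  sin(x_full) =  cos(x)
- //   k mod 4 == 2  ->  sin(x_full) = -sin(x)
- //   k mod 4 == 3  ->  sin(x_full) = -cos(x)
- // which is why sin() above selects the cosine polynomial when k is odd
- // (sin_usecos) and negates the result when k mod 4 > 1 (flip_sign).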
- __declspec(safe)
- static inline uniform float sin(uniform float x_full) {
- if (__have_native_trigonometry)
- {
- return __sin_uniform_float(x_full);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_sinf(x_full);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- static const uniform float pi_over_two_vec = 1.57079637050628662109375;
- static const uniform float two_over_pi_vec = 0.636619746685028076171875;
- uniform float scaled = x_full * two_over_pi_vec;
- uniform float k_real = floor(scaled);
- uniform int k = (int)k_real;
- // Reduced range version of x
- uniform float x = x_full - k_real * pi_over_two_vec;
- uniform int k_mod4 = k & 3;
- uniform bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
- uniform bool flip_sign = (k_mod4 > 1);
- // These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
- // 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
- static const uniform float sin_c2 = -0.16666667163372039794921875;
- static const uniform float sin_c4 = 8.333347737789154052734375e-3;
- static const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
- static const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
- static const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
- static const uniform float cos_c2 = -0.5;
- static const uniform float cos_c4 = 4.166664183139801025390625e-2;
- static const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
- static const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
- static const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
- uniform float outside, c2, c4, c6, c8, c10;
- if (sin_usecos) {
- outside = 1.;
- c2 = cos_c2;
- c4 = cos_c4;
- c6 = cos_c6;
- c8 = cos_c8;
- c10 = cos_c10;
- }
- else {
- outside = x;
- c2 = sin_c2;
- c4 = sin_c4;
- c6 = sin_c6;
- c8 = sin_c8;
- c10 = sin_c10;
- }
- uniform float x2 = x * x;
- uniform float formula = x2 * c10 + c8;
- formula = x2 * formula + c6;
- formula = x2 * formula + c4;
- formula = x2 * formula + c2;
- formula = x2 * formula + 1.;
- formula *= outside;
- formula = flip_sign ? -formula : formula;
- return formula;
- }
- }
- __declspec(safe)
- static inline float asin(float x0) {
- bool isneg = x0 < 0;
- float x = abs(x0);
- bool isnan = (x > 1);
- float v;
- if (__have_native_trigonometry)
- {
- return __asin_varying_float(x0);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_asinf(x0);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_asinf(extract(x0, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc)
- {
- // sollya
- // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5,6,7,8,9,10|],
- // [|single...|], [1e-20;.9999999999999999]);
- // avg error: 8.5716801e-09, max error: 2.1373853e-07
- v = 1.57079637050628662109375f +
- x * (-0.21460501849651336669921875f +
- x * (8.9116774499416351318359375e-2f +
- x * (-5.146093666553497314453125e-2f +
- x * (3.7269376218318939208984375e-2f +
- x * (-3.5882405936717987060546875e-2f +
- x * (4.14929799735546112060546875e-2f +
- x * (-4.25077490508556365966796875e-2f +
- x * (3.05023305118083953857421875e-2f +
- x * (-1.2897425331175327301025390625e-2f +
- x * 2.38926825113594532012939453125e-3f)))))))));
- }
- else if (__math_lib == __math_lib_ispc_fast)
- {
- // sollya
- // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
- // [1e-20;.9999999999999999]);
- // avg error: 1.1105439e-06, max error 1.3187528e-06
- v = 1.57079517841339111328125f +
- x * (-0.21450997889041900634765625f +
- x * (8.78556668758392333984375e-2f +
- x * (-4.489909112453460693359375e-2f +
- x * (1.928029954433441162109375e-2f +
- x * (-4.3095736764371395111083984375e-3f)))));
- }
- v *= -sqrt(1.f - x);
- v = v + 1.57079637050628662109375;
- if (v < 0) v = 0;
- // v = max(0, v);
- if (isneg) v = -v;
- if (isnan) v = floatbits(0x7fc00000);
- return v;
- }
- __declspec(safe)
- static inline uniform float asin(uniform float x0) {
- uniform bool isneg = x0 < 0;
- uniform float x = abs(x0);
- uniform bool isnan = (x > 1);
- uniform float v;
- if (__have_native_trigonometry)
- {
- return __asin_uniform_float(x0);
- }
- else if (__math_lib == __math_lib_svml ||
- __math_lib == __math_lib_system) {
- return __stdlib_asinf(x0);
- }
- else if (__math_lib == __math_lib_ispc)
- {
- // sollya
- // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5,6,7,8,9,10|],
- // [|single...|], [1e-20;.9999999999999999]);
- // avg error: 8.5716801e-09, max error: 2.1373853e-07
- v = 1.57079637050628662109375f +
- x * (-0.21460501849651336669921875f +
- x * (8.9116774499416351318359375e-2f +
- x * (-5.146093666553497314453125e-2f +
- x * (3.7269376218318939208984375e-2f +
- x * (-3.5882405936717987060546875e-2f +
- x * (4.14929799735546112060546875e-2f +
- x * (-4.25077490508556365966796875e-2f +
- x * (3.05023305118083953857421875e-2f +
- x * (-1.2897425331175327301025390625e-2f +
- x * 2.38926825113594532012939453125e-3f)))))))));
- }
- else if (__math_lib == __math_lib_ispc_fast)
- {
- // sollya
- // fpminimax(((asin(x)-pi/2)/-sqrt(1-x)), [|0,1,2,3,4,5|],[|single...|],
- // [1e-20;.9999999999999999]);
- // avg error: 1.1105439e-06, max error 1.3187528e-06
- v = 1.57079517841339111328125f +
- x * (-0.21450997889041900634765625f +
- x * (8.78556668758392333984375e-2f +
- x * (-4.489909112453460693359375e-2f +
- x * (1.928029954433441162109375e-2f +
- x * (-4.3095736764371395111083984375e-3f)))));
- }
- v *= -sqrt(1.f - x);
- v = v + 1.57079637050628662109375;
- if (v < 0) v = 0;
- // v = max(0, v);
- if (isneg) v = -v;
- if (isnan) v = floatbits(0x7fc00000);
- return v;
- }
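- // Note (illustrative): both asin() variants above reconstruct the result
- // from the fitted polynomial p(x) ~ (asin(x) - pi/2) / (-sqrt(1 - x)) as
- // v = pi/2 - sqrt(1 - x) * p(x), then clamp at 0, restore the sign for
- // negative inputs, and return NaN for |x| > 1.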
- __declspec(safe)
- static inline float cos(float x_full) {
- if (__have_native_trigonometry)
- {
- return __cos_varying_float(x_full);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_cosf(x_full);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_cosf(extract(x_full, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- static const float pi_over_two_vec = 1.57079637050628662109375;
- static const float two_over_pi_vec = 0.636619746685028076171875;
- float scaled = x_full * two_over_pi_vec;
- float k_real = floor(scaled);
- int k = (int)k_real;
- // Reduced range version of x
- float x = x_full - k_real * pi_over_two_vec;
- int k_mod4 = k & 3;
- bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
- bool flip_sign = (k_mod4 == 1 || k_mod4 == 2);
- const float sin_c2 = -0.16666667163372039794921875;
- const float sin_c4 = 8.333347737789154052734375e-3;
- const float sin_c6 = -1.9842604524455964565277099609375e-4;
- const float sin_c8 = 2.760012648650445044040679931640625e-6;
- const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
- const float cos_c2 = -0.5;
- const float cos_c4 = 4.166664183139801025390625e-2;
- const float cos_c6 = -1.388833043165504932403564453125e-3;
- const float cos_c8 = 2.47562347794882953166961669921875e-5;
- const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
- float outside = cos_usecos ? 1. : x;
- float c2 = cos_usecos ? cos_c2 : sin_c2;
- float c4 = cos_usecos ? cos_c4 : sin_c4;
- float c6 = cos_usecos ? cos_c6 : sin_c6;
- float c8 = cos_usecos ? cos_c8 : sin_c8;
- float c10 = cos_usecos ? cos_c10 : sin_c10;
- float x2 = x * x;
- float formula = x2 * c10 + c8;
- formula = x2 * formula + c6;
- formula = x2 * formula + c4;
- formula = x2 * formula + c2;
- formula = x2 * formula + 1.;
- formula *= outside;
- formula = flip_sign ? -formula : formula;
- return formula;
- }
- }
- __declspec(safe)
- static inline uniform float cos(uniform float x_full) {
- if (__have_native_trigonometry)
- {
- return __cos_uniform_float(x_full);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_cosf(x_full);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- static const uniform float pi_over_two_vec = 1.57079637050628662109375;
- static const uniform float two_over_pi_vec = 0.636619746685028076171875;
- uniform float scaled = x_full * two_over_pi_vec;
- uniform float k_real = floor(scaled);
- uniform int k = (int)k_real;
- // Reduced range version of x
- uniform float x = x_full - k_real * pi_over_two_vec;
- uniform int k_mod4 = k & 3;
- uniform bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
- uniform bool flip_sign = (k_mod4 == 1 || k_mod4 == 2);
- const uniform float sin_c2 = -0.16666667163372039794921875;
- const uniform float sin_c4 = 8.333347737789154052734375e-3;
- const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
- const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
- const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
- const uniform float cos_c2 = -0.5;
- const uniform float cos_c4 = 4.166664183139801025390625e-2;
- const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
- const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
- const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
- uniform float outside, c2, c4, c6, c8, c10;
- if (cos_usecos) {
- outside = 1.;
- c2 = cos_c2;
- c4 = cos_c4;
- c6 = cos_c6;
- c8 = cos_c8;
- c10 = cos_c10;
- }
- else {
- outside = x;
- c2 = sin_c2;
- c4 = sin_c4;
- c6 = sin_c6;
- c8 = sin_c8;
- c10 = sin_c10;
- }
- uniform float x2 = x * x;
- uniform float formula = x2 * c10 + c8;
- formula = x2 * formula + c6;
- formula = x2 * formula + c4;
- formula = x2 * formula + c2;
- formula = x2 * formula + 1.;
- formula *= outside;
- formula = flip_sign ? -formula : formula;
- return formula;
- }
- }
- __declspec(safe)
- static inline float acos(float v) {
- if (__have_native_trigonometry)
- return __acos_varying_float(v);
- else
- return 1.57079637050628662109375 - asin(v);
- }
- __declspec(safe)
- static inline double acos(const double v) {
- if (__have_native_trigonometry)
- return __acos_varying_double(v);
- else
- return 1.57079637050628662109375d0 - asin(v);
- }
- __declspec(safe)
- static inline uniform float acos(uniform float v) {
- if (__have_native_trigonometry)
- return __acos_uniform_float(v);
- else
- return 1.57079637050628662109375 - asin(v);
- }
- __declspec(safe)
- static inline uniform double acos(const uniform double v) {
- if (__have_native_trigonometry)
- return __acos_uniform_double(v);
- else
- return 1.57079637050628662109375d0 - asin(v);
- }
- __declspec(safe)
- static inline void sincos(float x_full, varying float * uniform sin_result,
- varying float * uniform cos_result) {
- if (__have_native_trigonometry)
- {
- __sincos_varying_float(x_full, sin_result, cos_result);
- }
- else if (__math_lib == __math_lib_svml) {
- __svml_sincosf(x_full, sin_result, cos_result);
- }
- else if (__math_lib == __math_lib_system) {
- foreach_active (i) {
- uniform float s, c;
- __stdlib_sincosf(extract(x_full, i), &s, &c);
- *sin_result = insert(*sin_result, i, s);
- *cos_result = insert(*cos_result, i, c);
- }
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const float pi_over_two_vec = 1.57079637050628662109375;
- const float two_over_pi_vec = 0.636619746685028076171875;
- float scaled = x_full * two_over_pi_vec;
- float k_real = floor(scaled);
- int k = (int)k_real;
- // Reduced range version of x
- float x = x_full - k_real * pi_over_two_vec;
- int k_mod4 = k & 3;
- bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
- bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
- bool sin_flipsign = (k_mod4 > 1);
- bool cos_flipsign = (k_mod4 == 1 || k_mod4 == 2);
- const float one_vec = 1.;
- const float sin_c2 = -0.16666667163372039794921875;
- const float sin_c4 = 8.333347737789154052734375e-3;
- const float sin_c6 = -1.9842604524455964565277099609375e-4;
- const float sin_c8 = 2.760012648650445044040679931640625e-6;
- const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
- const float cos_c2 = -0.5;
- const float cos_c4 = 4.166664183139801025390625e-2;
- const float cos_c6 = -1.388833043165504932403564453125e-3;
- const float cos_c8 = 2.47562347794882953166961669921875e-5;
- const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
- float x2 = x * x;
- float sin_formula = x2 * sin_c10 + sin_c8;
- float cos_formula = x2 * cos_c10 + cos_c8;
- sin_formula = x2 * sin_formula + sin_c6;
- cos_formula = x2 * cos_formula + cos_c6;
- sin_formula = x2 * sin_formula + sin_c4;
- cos_formula = x2 * cos_formula + cos_c4;
- sin_formula = x2 * sin_formula + sin_c2;
- cos_formula = x2 * cos_formula + cos_c2;
- sin_formula = x2 * sin_formula + one_vec;
- cos_formula = x2 * cos_formula + one_vec;
- sin_formula *= x;
- *sin_result = sin_usecos ? cos_formula : sin_formula;
- *cos_result = cos_usecos ? cos_formula : sin_formula;
- *sin_result = sin_flipsign ? -*sin_result : *sin_result;
- *cos_result = cos_flipsign ? -*cos_result : *cos_result;
- }
- }
- __declspec(safe)
- static inline void sincos(uniform float x_full, uniform float * uniform sin_result,
- uniform float * uniform cos_result) {
- if (__have_native_trigonometry)
- {
- __sincos_uniform_float(x_full, sin_result, cos_result);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- __stdlib_sincosf(x_full, sin_result, cos_result);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const uniform float pi_over_two_vec = 1.57079637050628662109375;
- const uniform float two_over_pi_vec = 0.636619746685028076171875;
- uniform float scaled = x_full * two_over_pi_vec;
- uniform float k_real = floor(scaled);
- uniform int k = (uniform int)k_real;
- // Reduced range version of x
- uniform float x = x_full - k_real * pi_over_two_vec;
- uniform int k_mod4 = k & 3;
- uniform bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
- uniform bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
- uniform bool sin_flipsign = (k_mod4 > 1);
- uniform bool cos_flipsign = (k_mod4 == 1 || k_mod4 == 2);
- const uniform float one_vec = 1.;
- const uniform float sin_c2 = -0.16666667163372039794921875;
- const uniform float sin_c4 = 8.333347737789154052734375e-3;
- const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
- const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
- const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
- const uniform float cos_c2 = -0.5;
- const uniform float cos_c4 = 4.166664183139801025390625e-2;
- const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
- const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
- const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
- uniform float x2 = x * x;
- uniform float sin_formula = x2 * sin_c10 + sin_c8;
- uniform float cos_formula = x2 * cos_c10 + cos_c8;
- sin_formula = x2 * sin_formula + sin_c6;
- cos_formula = x2 * cos_formula + cos_c6;
- sin_formula = x2 * sin_formula + sin_c4;
- cos_formula = x2 * cos_formula + cos_c4;
- sin_formula = x2 * sin_formula + sin_c2;
- cos_formula = x2 * cos_formula + cos_c2;
- sin_formula = x2 * sin_formula + one_vec;
- cos_formula = x2 * cos_formula + one_vec;
- sin_formula *= x;
- *sin_result = sin_usecos ? cos_formula : sin_formula;
- *cos_result = cos_usecos ? cos_formula : sin_formula;
- *sin_result = sin_flipsign ? -*sin_result : *sin_result;
- *cos_result = cos_flipsign ? -*cos_result : *cos_result;
- }
- }
- __declspec(safe)
- static inline float tan(float x_full) {
- if (__have_native_trigonometry)
- {
- return __tan_varying_float(x_full);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_tanf(x_full);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_tanf(extract(x_full, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const float pi_over_four_vec = 0.785398185253143310546875;
- const float four_over_pi_vec = 1.27323949337005615234375;
- bool x_lt_0 = x_full < 0.;
- float y = x_lt_0 ? -x_full : x_full;
- float scaled = y * four_over_pi_vec;
- float k_real = floor(scaled);
- int k = (int)k_real;
- float x = y - k_real * pi_over_four_vec;
- // if k & 1, x -= Pi/4
- bool need_offset = (k & 1) != 0;
- x = need_offset ? x - pi_over_four_vec : x;
- // if k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
- int k_mod4 = k & 3;
- bool use_cotan = (k_mod4 == 1) || (k_mod4 == 2);
- const float one_vec = 1.0;
- const float tan_c2 = 0.33333075046539306640625;
- const float tan_c4 = 0.13339905440807342529296875;
- const float tan_c6 = 5.3348250687122344970703125e-2;
- const float tan_c8 = 2.46033705770969390869140625e-2;
- const float tan_c10 = 2.892402000725269317626953125e-3;
- const float tan_c12 = 9.500005282461643218994140625e-3;
- const float cot_c2 = -0.3333333432674407958984375;
- const float cot_c4 = -2.222204394638538360595703125e-2;
- const float cot_c6 = -2.11752182804048061370849609375e-3;
- const float cot_c8 = -2.0846328698098659515380859375e-4;
- const float cot_c10 = -2.548247357481159269809722900390625e-5;
- const float cot_c12 = -3.5257363606433500535786151885986328125e-7;
- float x2 = x * x;
- float z;
- cif (use_cotan) {
- float cot_val = x2 * cot_c12 + cot_c10;
- cot_val = x2 * cot_val + cot_c8;
- cot_val = x2 * cot_val + cot_c6;
- cot_val = x2 * cot_val + cot_c4;
- cot_val = x2 * cot_val + cot_c2;
- cot_val = x2 * cot_val + one_vec;
- // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
- cot_val /= -x;
- z = cot_val;
- } else {
- float tan_val = x2 * tan_c12 + tan_c10;
- tan_val = x2 * tan_val + tan_c8;
- tan_val = x2 * tan_val + tan_c6;
- tan_val = x2 * tan_val + tan_c4;
- tan_val = x2 * tan_val + tan_c2;
- tan_val = x2 * tan_val + one_vec;
- // Equation was for tan(x)/x
- tan_val *= x;
- z = tan_val;
- }
- return x_lt_0 ? -z : z;
- }
- }
- __declspec(safe)
- static inline uniform float tan(uniform float x_full) {
- if (__have_native_trigonometry)
- {
- return __tan_uniform_float(x_full);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_tanf(x_full);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const uniform float pi_over_four_vec = 0.785398185253143310546875;
- const uniform float four_over_pi_vec = 1.27323949337005615234375;
- uniform bool x_lt_0 = x_full < 0.;
- uniform float y = x_lt_0 ? -x_full : x_full;
- uniform float scaled = y * four_over_pi_vec;
- uniform float k_real = floor(scaled);
- uniform int k = (int)k_real;
- uniform float x = y - k_real * pi_over_four_vec;
- // if k & 1, x -= Pi/4
- uniform bool need_offset = (k & 1) != 0;
- x = need_offset ? x - pi_over_four_vec : x;
- // if k & 3 == (0 or 3) let z = tan_In...(y) otherwise z = -cot_In0To...
- uniform int k_mod4 = k & 3;
- uniform bool use_cotan = (k_mod4 == 1) || (k_mod4 == 2);
- const uniform float one_vec = 1.0;
- const uniform float tan_c2 = 0.33333075046539306640625;
- const uniform float tan_c4 = 0.13339905440807342529296875;
- const uniform float tan_c6 = 5.3348250687122344970703125e-2;
- const uniform float tan_c8 = 2.46033705770969390869140625e-2;
- const uniform float tan_c10 = 2.892402000725269317626953125e-3;
- const uniform float tan_c12 = 9.500005282461643218994140625e-3;
- const uniform float cot_c2 = -0.3333333432674407958984375;
- const uniform float cot_c4 = -2.222204394638538360595703125e-2;
- const uniform float cot_c6 = -2.11752182804048061370849609375e-3;
- const uniform float cot_c8 = -2.0846328698098659515380859375e-4;
- const uniform float cot_c10 = -2.548247357481159269809722900390625e-5;
- const uniform float cot_c12 = -3.5257363606433500535786151885986328125e-7;
- uniform float x2 = x * x;
- uniform float z;
- if (use_cotan) {
- uniform float cot_val = x2 * cot_c12 + cot_c10;
- cot_val = x2 * cot_val + cot_c8;
- cot_val = x2 * cot_val + cot_c6;
- cot_val = x2 * cot_val + cot_c4;
- cot_val = x2 * cot_val + cot_c2;
- cot_val = x2 * cot_val + one_vec;
- // The equation is for x * cot(x) but we need -x * cot(x) for the tan part.
- cot_val /= -x;
- z = cot_val;
- } else {
- uniform float tan_val = x2 * tan_c12 + tan_c10;
- tan_val = x2 * tan_val + tan_c8;
- tan_val = x2 * tan_val + tan_c6;
- tan_val = x2 * tan_val + tan_c4;
- tan_val = x2 * tan_val + tan_c2;
- tan_val = x2 * tan_val + one_vec;
- // Equation was for tan(x)/x
- tan_val *= x;
- z = tan_val;
- }
- return x_lt_0 ? -z : z;
- }
- }
- __declspec(safe)
- static inline float atan(float x_full) {
- if (__have_native_trigonometry)
- {
- return __atan_varying_float(x_full);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_atanf(x_full);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_atanf(extract(x_full, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const float pi_over_two_vec = 1.57079637050628662109375;
- // atan(-x) = -atan(x) (so flip from negative to positive first)
- // if x > 1 -> atan(x) = Pi/2 - atan(1/x)
- bool x_neg = x_full < 0;
- float x_flipped = x_neg ? -x_full : x_full;
- bool x_gt_1 = x_flipped > 1.;
- float x = x_gt_1 ? 1./x_flipped : x_flipped;
- // These coefficients approximate atan(x)/x
- const float atan_c0 = 0.99999988079071044921875;
- const float atan_c2 = -0.3333191573619842529296875;
- const float atan_c4 = 0.199689209461212158203125;
- const float atan_c6 = -0.14015688002109527587890625;
- const float atan_c8 = 9.905083477497100830078125e-2;
- const float atan_c10 = -5.93664981424808502197265625e-2;
- const float atan_c12 = 2.417283318936824798583984375e-2;
- const float atan_c14 = -4.6721356920897960662841796875e-3;
- float x2 = x * x;
- float result = x2 * atan_c14 + atan_c12;
- result = x2 * result + atan_c10;
- result = x2 * result + atan_c8;
- result = x2 * result + atan_c6;
- result = x2 * result + atan_c4;
- result = x2 * result + atan_c2;
- result = x2 * result + atan_c0;
- result *= x;
- result = x_gt_1 ? pi_over_two_vec - result : result;
- result = x_neg ? -result : result;
- return result;
- }
- }
- __declspec(safe)
- static inline uniform float atan(uniform float x_full) {
- if (__have_native_trigonometry)
- {
- return __atan_uniform_float(x_full);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_atanf(x_full);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const uniform float pi_over_two_vec = 1.57079637050628662109375;
- // atan(-x) = -atan(x) (so flip from negative to positive first)
- // if x > 1 -> atan(x) = Pi/2 - atan(1/x)
- uniform bool x_neg = x_full < 0;
- uniform float x_flipped = x_neg ? -x_full : x_full;
- uniform bool x_gt_1 = x_flipped > 1.;
- uniform float x = x_gt_1 ? 1./x_flipped : x_flipped;
- // These coefficients approximate atan(x)/x
- const uniform float atan_c0 = 0.99999988079071044921875;
- const uniform float atan_c2 = -0.3333191573619842529296875;
- const uniform float atan_c4 = 0.199689209461212158203125;
- const uniform float atan_c6 = -0.14015688002109527587890625;
- const uniform float atan_c8 = 9.905083477497100830078125e-2;
- const uniform float atan_c10 = -5.93664981424808502197265625e-2;
- const uniform float atan_c12 = 2.417283318936824798583984375e-2;
- const uniform float atan_c14 = -4.6721356920897960662841796875e-3;
- uniform float x2 = x * x;
- uniform float result = x2 * atan_c14 + atan_c12;
- result = x2 * result + atan_c10;
- result = x2 * result + atan_c8;
- result = x2 * result + atan_c6;
- result = x2 * result + atan_c4;
- result = x2 * result + atan_c2;
- result = x2 * result + atan_c0;
- result *= x;
- result = x_gt_1 ? pi_over_two_vec - result : result;
- result = x_neg ? -result : result;
- return result;
- }
- }
- __declspec(safe)
- static inline float atan2(float y, float x) {
- if (__have_native_trigonometry)
- {
- return __atan2_varying_float(y, x);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_atan2f(y, x);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const float pi_vec = 3.1415927410125732421875;
- const float pi_over_two_vec = 1.57079637050628662109375;
- // atan2(y, x) =
- //
- // atan2(y > 0, x = +-0) -> Pi/2
- // atan2(y < 0, x = +-0) -> -Pi/2
- // atan2(y = +-0, x < +0) -> +-Pi
- // atan2(y = +-0, x >= +0) -> +-0
- //
- // atan2(y >= 0, x < 0) -> Pi + atan(y/x)
- // atan2(y < 0, x < 0) -> -Pi + atan(y/x)
- // atan2(y, x > 0) -> atan(y/x)
- //
- // and then a bunch of code for dealing with infinities.
- float y_over_x = y/x;
- float atan_arg = atan(y_over_x);
- bool x_lt_0 = x < 0;
- bool y_lt_0 = y < 0;
- float offset = x_lt_0 ? (y_lt_0 ? -pi_vec : pi_vec) : 0;
- return offset + atan_arg;
- }
- }
- __declspec(safe)
- static inline uniform float atan2(uniform float y, uniform float x) {
- if (__have_native_trigonometry)
- {
- return __atan2_uniform_float(y, x);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_atan2f(y, x);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- const uniform float pi_vec = 3.1415927410125732421875;
- const uniform float pi_over_two_vec = 1.57079637050628662109375;
- uniform float y_over_x = y/x;
- uniform float atan_arg = atan(y_over_x);
- uniform bool x_lt_0 = x < 0;
- uniform bool y_lt_0 = y < 0;
- uniform float offset = x_lt_0 ? (y_lt_0 ? -pi_vec : pi_vec) : 0;
- return offset + atan_arg;
- }
- }
- __declspec(safe)
- static inline float exp(float x_full) {
- if (__have_native_transcendentals) {
- return __exp_varying_float(x_full);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_expf(x_full);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_expf(extract(x_full, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc_fast) {
- float z = floor(1.44269504088896341f * x_full + 0.5f);
- int n;
- x_full -= z * 0.693359375f;
- x_full -= z * -2.12194440e-4f;
- n = (int)z;
- z = x_full * x_full;
- z = (((((1.9875691500E-4f * x_full + 1.3981999507E-3f) * x_full +
- 8.3334519073E-3f) * x_full + 4.1665795894E-2f) * x_full +
- 1.6666665459E-1f) * x_full + 5.0000001201E-1f) * z + x_full + 1.f;
- x_full = ldexp(z, n);
- return x_full;
- }
- else if (__math_lib == __math_lib_ispc) {
- const float ln2_part1 = 0.6931457519;
- const float ln2_part2 = 1.4286067653e-6;
- const float one_over_ln2 = 1.44269502162933349609375;
- float scaled = x_full * one_over_ln2;
- float k_real = floor(scaled);
- int k = (int)k_real;
- // Reduced range version of x
- float x = x_full - k_real * ln2_part1;
- x -= k_real * ln2_part2;
- // These coefficients are for e^x in [0, ln(2)]
- const float one = 1.;
- const float c2 = 0.4999999105930328369140625;
- const float c3 = 0.166668415069580078125;
- const float c4 = 4.16539050638675689697265625e-2;
- const float c5 = 8.378830738365650177001953125e-3;
- const float c6 = 1.304379315115511417388916015625e-3;
- const float c7 = 2.7555381529964506626129150390625e-4;
- float result = x * c7 + c6;
- result = x * result + c5;
- result = x * result + c4;
- result = x * result + c3;
- result = x * result + c2;
- result = x * result + one;
- result = x * result + one;
- // Compute 2^k (should differ for float and double, but I'll avoid
- // it for now and just do floats)
- const int fpbias = 127;
- int biased_n = k + fpbias;
- bool overflow = k > fpbias;
- // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
- // we've got underflow. -127 * ln(2) -> -88.02. So the most
- // negative float input that doesn't result in zero is like -88.
- bool underflow = (biased_n <= 0);
- const int InfBits = 0x7f800000;
- biased_n <<= 23;
- // Reinterpret this thing as float
- float two_to_the_n = floatbits(biased_n);
- // Handle both doubles and floats (hopefully eliding the copy for float)
- float elemtype_2n = two_to_the_n;
- result *= elemtype_2n;
- result = overflow ? floatbits(InfBits) : result;
- result = underflow ? 0. : result;
- return result;
- }
- }
- __declspec(safe)
- static inline uniform float exp(uniform float x_full) {
- if (__have_native_transcendentals) {
- return __exp_uniform_float(x_full);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_expf(x_full);
- }
- else if (__math_lib == __math_lib_ispc_fast) {
- uniform float z = floor(1.44269504088896341f * x_full + 0.5f);
- uniform int n;
- x_full -= z * 0.693359375f;
- x_full -= z * -2.12194440e-4f;
- n = (int)z;
- z = x_full * x_full;
- z = (((((1.9875691500E-4f * x_full + 1.3981999507E-3f) * x_full +
- 8.3334519073E-3f) * x_full + 4.1665795894E-2f) * x_full +
- 1.6666665459E-1f) * x_full + 5.0000001201E-1f) * z + x_full + 1.f;
- x_full = ldexp(z, n);
- return x_full;
- }
- else if (__math_lib == __math_lib_ispc) {
- const uniform float ln2_part1 = 0.6931457519;
- const uniform float ln2_part2 = 1.4286067653e-6;
- const uniform float one_over_ln2 = 1.44269502162933349609375;
- uniform float scaled = x_full * one_over_ln2;
- uniform float k_real = floor(scaled);
- uniform int k = (uniform int)k_real;
- // Reduced range version of x
- uniform float x = x_full - k_real * ln2_part1;
- x -= k_real * ln2_part2;
- // These coefficients are for e^x in [0, ln(2)]
- const uniform float one = 1.;
- const uniform float c2 = 0.4999999105930328369140625;
- const uniform float c3 = 0.166668415069580078125;
- const uniform float c4 = 4.16539050638675689697265625e-2;
- const uniform float c5 = 8.378830738365650177001953125e-3;
- const uniform float c6 = 1.304379315115511417388916015625e-3;
- const uniform float c7 = 2.7555381529964506626129150390625e-4;
- uniform float result = x * c7 + c6;
- result = x * result + c5;
- result = x * result + c4;
- result = x * result + c3;
- result = x * result + c2;
- result = x * result + one;
- result = x * result + one;
- // Compute 2^k (should differ for float and double, but I'll avoid
- // it for now and just do floats)
- const uniform int fpbias = 127;
- uniform int biased_n = k + fpbias;
- uniform bool overflow = k > fpbias;
- // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
- // we've got underflow. -127 * ln(2) -> -88.02. So the most
- // negative float input that doesn't result in zero is like -88.
- uniform bool underflow = (biased_n <= 0);
- const uniform int InfBits = 0x7f800000;
- biased_n <<= 23;
- // Reinterpret this thing as a uniform float
- uniform float two_to_the_n = floatbits(biased_n);
- // Handle both doubles and floats (hopefully eliding the copy for float)
- uniform float elemtype_2n = two_to_the_n;
- result *= elemtype_2n;
- result = overflow ? floatbits(InfBits) : result;
- result = underflow ? 0. : result;
- return result;
- }
- }
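- // Worked example (illustrative) for the 2^k construction in exp() above:
- // the biased exponent is shifted into the float's exponent field, so for
- // k = 3, biased_n = 3 + 127 = 130 and 130 << 23 = 0x41000000, which is
- // exactly the bit pattern of 8.0f = 2^3.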
- // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
- // * log(2) + log(y) where y is the reduced range (usually in [1/2,
- // 1)).
- __declspec(safe)
- static inline void __range_reduce_log(float input, varying float * uniform reduced,
- varying int * uniform exponent) {
- int int_version = intbits(input);
- // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
- // exponent mask = 0111 1111 1000 0000 0000 0000 0000 0000
- // 0x7 0xF 0x8 0x0 0x0 0x0 0x0 0x0
- // non-exponent = 1000 0000 0111 1111 1111 1111 1111 1111
- // = 0x8 0x0 0x7 0xF 0xF 0xF 0xF 0xF
- //const int exponent_mask(0x7F800000)
- static const int nonexponent_mask = 0x807FFFFF;
- // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126
- static const int exponent_neg1 = (126l << 23);
- // NOTE(boulos): We don't need to mask anything out since we know
- // the sign bit has to be 0. If it's 1, we need to return infinity/nan
- // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
- int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128]
- int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
- *exponent = offset_exponent - 127; // get the real value
- // Blend the offset_exponent with the original input (do this in
- // int for now, until I decide if float can have & and |)
- int blended = (int_version & nonexponent_mask) | (exponent_neg1);
- *reduced = floatbits(blended);
- }
- __declspec(safe)
- static inline void __range_reduce_log(uniform float input, uniform float * uniform reduced,
- uniform int * uniform exponent) {
- uniform int int_version = intbits(input);
- static const uniform int nonexponent_mask = 0x807FFFFF;
- static const uniform int exponent_neg1 = (126ul << 23);
- uniform int biased_exponent = int_version >> 23;
- uniform int offset_exponent = biased_exponent + 1;
- *exponent = offset_exponent - 127; // get the real value
- uniform int blended = (int_version & nonexponent_mask) | (exponent_neg1);
- *reduced = floatbits(blended);
- }
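- // Worked example (illustrative) for __range_reduce_log(): input = 8.0f has
- // biased exponent 130, so *exponent = (130 + 1) - 127 = 4, and blending the
- // mantissa with exponent_neg1 gives *reduced = 0.5f; indeed 8.0 = 0.5 * 2^4,
- // so log() below computes log(8.0) as 4 * ln(2) + log(0.5).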
- __declspec(safe)
- static inline float log(float x_full) {
- if (__have_native_transcendentals) {
- return __log_varying_float(x_full);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_logf(x_full);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_logf(extract(x_full, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc_fast) {
- int e;
- x_full = frexp(x_full, &e);
- int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
- e += x_smaller_SQRTHF;
- int ix_add = intbits(x_full);
- ix_add &= x_smaller_SQRTHF;
- x_full += floatbits(ix_add) - 1.f;
- float z = x_full * x_full;
- float y =
- ((((((((7.0376836292E-2f * x_full
- + -1.1514610310E-1f) * x_full
- + 1.1676998740E-1f) * x_full
- + -1.2420140846E-1f) * x_full
- + 1.4249322787E-1f) * x_full
- + -1.6668057665E-1f) * x_full
- + 2.0000714765E-1f) * x_full
- + -2.4999993993E-1f) * x_full
- + 3.3333331174E-1f) * x_full * z;
- float fe = (float)e;
- y += fe * -2.12194440e-4;
- y -= 0.5f * z;
- z = x_full + y;
- return z + 0.693359375 * fe;
- }
- else if (__math_lib == __math_lib_ispc) {
- float reduced;
- int exponent;
- const int NaN_bits = 0x7fc00000;
- const int Neg_Inf_bits = 0xFF800000;
- const float NaN = floatbits(NaN_bits);
- const float neg_inf = floatbits(Neg_Inf_bits);
- bool use_nan = x_full < 0.;
- bool use_inf = x_full == 0.;
- bool exceptional = use_nan || use_inf;
- const float one = 1.0;
- float patched = exceptional ? one : x_full;
- __range_reduce_log(patched, &reduced, &exponent);
- const float ln2 = 0.693147182464599609375;
- float x1 = one - reduced;
- const float c1 = 0.50000095367431640625;
- const float c2 = 0.33326041698455810546875;
- const float c3 = 0.2519190013408660888671875;
- const float c4 = 0.17541764676570892333984375;
- const float c5 = 0.3424419462680816650390625;
- const float c6 = -0.599632322788238525390625;
- const float c7 = +1.98442304134368896484375;
- const float c8 = -2.4899270534515380859375;
- const float c9 = +1.7491014003753662109375;
- float result = x1 * c9 + c8;
- result = x1 * result + c7;
- result = x1 * result + c6;
- result = x1 * result + c5;
- result = x1 * result + c4;
- result = x1 * result + c3;
- result = x1 * result + c2;
- result = x1 * result + c1;
- result = x1 * result + one;
- // Equation was for -(ln(red)/(1-red))
- result *= -x1;
- result += (float)(exponent) * ln2;
- return exceptional ? (use_nan ? NaN : neg_inf) : result;
- }
- }
- __declspec(safe)
- static inline uniform float log(uniform float x_full) {
- if (__have_native_transcendentals) {
- return __log_uniform_float(x_full);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_logf(x_full);
- }
- else if (__math_lib == __math_lib_ispc_fast) {
- uniform int e;
- x_full = frexp(x_full, &e);
- uniform int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
- e += x_smaller_SQRTHF;
- uniform int ix_add = intbits(x_full);
- ix_add &= x_smaller_SQRTHF;
- x_full += floatbits(ix_add) - 1.f;
- uniform float z = x_full * x_full;
- uniform float y =
- ((((((((7.0376836292E-2f * x_full
- + -1.1514610310E-1f) * x_full
- + 1.1676998740E-1f) * x_full
- + -1.2420140846E-1f) * x_full
- + 1.4249322787E-1f) * x_full
- + -1.6668057665E-1f) * x_full
- + 2.0000714765E-1f) * x_full
- + -2.4999993993E-1f) * x_full
- + 3.3333331174E-1f) * x_full * z;
- uniform float fe = (uniform float)e;
- y += fe * -2.12194440e-4;
- y -= 0.5f * z;
- z = x_full + y;
- return z + 0.693359375 * fe;
- }
- else if (__math_lib == __math_lib_ispc) {
- uniform float reduced;
- uniform int exponent;
- const uniform int NaN_bits = 0x7fc00000;
- const uniform int Neg_Inf_bits = 0xFF800000;
- const uniform float NaN = floatbits(NaN_bits);
- const uniform float neg_inf = floatbits(Neg_Inf_bits);
- uniform bool use_nan = x_full < 0.;
- uniform bool use_inf = x_full == 0.;
- uniform bool exceptional = use_nan || use_inf;
- const uniform float one = 1.0;
- uniform float patched = exceptional ? one : x_full;
- __range_reduce_log(patched, &reduced, &exponent);
- const uniform float ln2 = 0.693147182464599609375;
- uniform float x1 = one - reduced;
- const uniform float c1 = 0.50000095367431640625;
- const uniform float c2 = 0.33326041698455810546875;
- const uniform float c3 = 0.2519190013408660888671875;
- const uniform float c4 = 0.17541764676570892333984375;
- const uniform float c5 = 0.3424419462680816650390625;
- const uniform float c6 = -0.599632322788238525390625;
- const uniform float c7 = +1.98442304134368896484375;
- const uniform float c8 = -2.4899270534515380859375;
- const uniform float c9 = +1.7491014003753662109375;
- uniform float result = x1 * c9 + c8;
- result = x1 * result + c7;
- result = x1 * result + c6;
- result = x1 * result + c5;
- result = x1 * result + c4;
- result = x1 * result + c3;
- result = x1 * result + c2;
- result = x1 * result + c1;
- result = x1 * result + one;
- // Equation was for -(ln(red)/(1-red))
- result *= -x1;
- result += (uniform float)(exponent) * ln2;
- return exceptional ? (use_nan ? NaN : neg_inf) : result;
- }
- }
- __declspec(safe)
- static inline float pow(float a, float b) {
- if (__have_native_transcendentals) {
- return __pow_varying_float(a, b);
- }
- else if (__math_lib == __math_lib_svml) {
- return __svml_powf(a, b);
- }
- else if (__math_lib == __math_lib_system) {
- float ret;
- foreach_active (i) {
- uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- return exp(b * log(a));
- }
- }
- __declspec(safe)
- static inline uniform float pow(uniform float a, uniform float b) {
- if (__have_native_transcendentals) {
- return __pow_uniform_float(a, b);
- }
- else if (__math_lib == __math_lib_system ||
- __math_lib == __math_lib_svml) {
- return __stdlib_powf(a, b);
- }
- else if (__math_lib == __math_lib_ispc ||
- __math_lib == __math_lib_ispc_fast) {
- return exp(b * log(a));
- }
- }
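- // Note (illustrative): the exp(b * log(a)) path above uses the identity
- // a^b = e^(b * ln a), valid for a > 0; e.g. pow(2.0f, 10.0f) =
- // exp(10 * ln 2) = 1024. For a < 0, log() returns NaN, so this path does
- // not produce meaningful results for negative bases.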
- ///////////////////////////////////////////////////////////////////////////
- // Transcendentals (double precision)
- __declspec(safe)
- static inline double sqrt(double v) {
- return __sqrt_varying_double(v);
- }
- __declspec(safe)
- static inline uniform double sqrt(uniform double v) {
- return __sqrt_uniform_double(v);
- }
- #define RSQRTD(QUAL) \
- __declspec(safe) \
- static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \
- { \
- QUAL double xh = x*0.5d; \
- y += y*(0.5d0 - xh*y*y); \
- y += y*(0.5d0 - xh*y*y); \
- return y; \
- } \
- __declspec(safe) \
- static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \
- { \
- if (x <= 1.0d+33 && x >= 1.0d-33) \
- return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \
- QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
- QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \
- QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \
- QUAL double y = rsqrt((QUAL float)(x*exp)); \
- return __rsqrt_iterate_##QUAL##_double(x, y*exph); \
- }
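- // Note (illustrative): each y += y*(0.5d0 - xh*y*y) step above is a
- // Newton-Raphson iteration for f(y) = 1/y^2 - x (i.e. y' = y*(1.5 - 0.5*x*y*y)),
- // and each step roughly doubles the number of correct bits in the
- // single-precision rsqrt() seed.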
- RSQRTD(varying)
- __declspec(safe)
- static inline double rsqrt(double v) {
- if (__have_native_rsqrtd)
- return __rsqrt_varying_double(v);
- else
- return __rsqrt_safe_varying_double(v);
- }
- RSQRTD(uniform)
- __declspec(safe)
- static inline uniform double rsqrt(uniform double v) {
- if (__have_native_rsqrtd)
- return __rsqrt_uniform_double(v);
- else
- return __rsqrt_safe_uniform_double(v);
- }
- __declspec(safe)
- static inline double ldexp(double x, int n) {
- unsigned int64 ex = 0x7ff0000000000000;
- unsigned int64 ix = intbits(x);
- ex &= ix;
- ix = ix & ~0x7ff0000000000000; // clear exponent
- int64 n64 = ((int64)n << 52) + ex;
- ix |= n64; // insert new exponent
- return doublebits(ix);
- }
- __declspec(safe)
- static inline uniform double ldexp(uniform double x, uniform int n) {
- uniform unsigned int64 ex = 0x7ff0000000000000;
- uniform unsigned int64 ix = intbits(x);
- ex &= ix;
- ix = ix & ~0x7ff0000000000000; // clear exponent
- uniform int64 n64 = ((int64)n << 52) + ex;
- ix |= n64; // insert new exponent
- return doublebits(ix);
- }
- __declspec(safe)
- static inline double frexp(double x, varying int * uniform pw2) {
- unsigned int64 ex = 0x7ff0000000000000; // exponent mask
- unsigned int64 ix = intbits(x);
- ex &= ix;
- ix &= ~0x7ff0000000000000; // clear exponent
- *pw2 = (int)(ex >> 52) - 1022; // compute exponent
- ix |= 0x3fe0000000000000; // insert exponent +1 in x
- return doublebits(ix);
- }
- __declspec(safe)
- static inline uniform double frexp(uniform double x, uniform int * uniform pw2) {
- uniform unsigned int64 ex = 0x7ff0000000000000; // exponent mask
- uniform unsigned int64 ix = intbits(x);
- ex &= ix;
- ix &= ~0x7ff0000000000000; // clear exponent
- *pw2 = (int)(ex >> 52) - 1022; // compute exponent
- ix |= 0x3fe0000000000000; // insert exponent +1 in x
- return doublebits(ix);
- }
- __declspec(safe)
- static inline double sin(double x) {
- if (__have_native_trigonometry)
- {
- return __sin_varying_double(x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_sind(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_sin(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double asin(uniform double x) {
- if (__have_native_trigonometry)
- {
- return __asin_uniform_double(x);
- }
- else
- {
- return __stdlib_asin(x);
- }
- }
- __declspec(safe)
- static inline uniform double sin(uniform double x) {
- if (__have_native_trigonometry)
- {
- return __sin_uniform_double(x);
- }
- else
- return __stdlib_sin(x);
- }
- __declspec(safe)
- static inline double asin(const double x) {
- if (__have_native_trigonometry)
- {
- return __asin_varying_double(x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_asind(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_asin(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline double cos(const double x) {
- if (__have_native_trigonometry)
- {
- return __cos_varying_double(x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_cosd(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_cos(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double cos(uniform double x) {
- if (__have_native_trigonometry)
- {
- return __cos_uniform_double(x);
- }
- else
- return __stdlib_cos(x);
- }
- __declspec(safe)
- static inline void sincos(double x, varying double * uniform sin_result,
- varying double * uniform cos_result) {
- if (__have_native_trigonometry)
- {
- __sincos_varying_double(x, sin_result, cos_result);
- }
- else if (__math_lib == __math_lib_svml)
- {
- __svml_sincosd(x, sin_result, cos_result);
- }
- else {
- foreach_active (i) {
- uniform double sr, cr;
- __stdlib_sincos(extract(x, i), &sr, &cr);
- *sin_result = insert(*sin_result, i, sr);
- *cos_result = insert(*cos_result, i, cr);
- }
- }
- }
- __declspec(safe)
- static inline void sincos(uniform double x, uniform double * uniform sin_result,
- uniform double * uniform cos_result) {
- if (__have_native_trigonometry)
- {
- __sincos_uniform_double(x, sin_result, cos_result);
- }
- else
- __stdlib_sincos(x, sin_result, cos_result);
- }
- __declspec(safe)
- static inline double tan(double x) {
- if (__have_native_trigonometry)
- {
- return __tan_varying_double(x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_tand(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_tan(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double tan(uniform double x) {
- if (__have_native_trigonometry)
- {
- return __tan_uniform_double(x);
- }
- else
- return __stdlib_tan(x);
- }
- __declspec(safe)
- static inline double atan(double x) {
- if (__have_native_trigonometry)
- {
- return __atan_varying_double(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_atan(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double atan(uniform double x) {
- if (__have_native_trigonometry)
- {
- return __atan_uniform_double(x);
- }
- else
- return __stdlib_atan(x);
- }
- __declspec(safe)
- static inline double atan2(double y, double x) {
- if (__have_native_trigonometry)
- {
- return __atan2_varying_double(y,x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_atan2d(y,x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double atan2(uniform double y, uniform double x) {
- if (__have_native_trigonometry)
- {
- return __atan2_uniform_double(y,x);
- }
- else
- return __stdlib_atan2(y, x);
- }
- __declspec(safe)
- static inline double exp(double x) {
- if (__have_native_transcendentals) {
- return __exp_varying_double(x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_expd(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_exp(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double exp(uniform double x) {
- if (__have_native_transcendentals) {
- return __exp_uniform_double(x);
- }
- else
- return __stdlib_exp(x);
- }
- __declspec(safe)
- static inline double log(double x) {
- if (__have_native_transcendentals) {
- return __log_varying_double(x);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_logd(x);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_log(extract(x, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double log(uniform double x) {
- if (__have_native_transcendentals) {
- return __log_uniform_double(x);
- }
- else
- return __stdlib_log(x);
- }
- __declspec(safe)
- static inline double pow(double a, double b) {
- if (__have_native_transcendentals) {
- return __pow_varying_double(a,b);
- }
- else if (__math_lib == __math_lib_svml)
- {
- return __svml_powd(a,b);
- }
- else {
- double ret;
- foreach_active (i) {
- uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
- ret = insert(ret, i, r);
- }
- return ret;
- }
- }
- __declspec(safe)
- static inline uniform double pow(uniform double a, uniform double b) {
- if (__have_native_transcendentals) {
- return __pow_uniform_double(a,b);
- }
- else
- return __stdlib_pow(a, b);
- }
- ///////////////////////////////////////////////////////////////////////////
- // half-precision floats
- __declspec(safe)
- static inline uniform float half_to_float(uniform unsigned int16 h) {
- if (__have_native_half) {
- return __half_to_float_uniform(h);
- }
- else {
- // https://gist.github.com/2144712
- // Fabian "ryg" Giesen.
- static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift
- uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits
- uniform unsigned int32 exp = shifted_exp & o; // just the exponent
- o += (uniform int32)(127 - 15) << 23; // exponent adjust
- // handle exponent special cases
- if (exp == shifted_exp) // Inf/NaN?
- o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust
- else if (exp == 0) { // Zero/Denormal?
- o += 1ul << 23; // extra exp adjust
- o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize
- }
- o |= ((int32)(h & 0x8000)) << 16; // sign bit
- return floatbits(o);
- }
- }
- __declspec(safe)
- static inline float half_to_float(unsigned int16 h) {
- if (__have_native_half) {
- return __half_to_float_varying(h);
- }
- else {
- // https://gist.github.com/2144712
- // Fabian "ryg" Giesen.
- const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift
- int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits
- unsigned int32 exp = shifted_exp & o; // just the exponent
- o += (int32)(127 - 15) << 23; // exponent adjust
- int32 infnan_val = o + ((int32)(128 - 16) << 23);
- int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23));
- int32 reg_val = (exp == 0) ? zerodenorm_val : o;
- int32 sign_bit = ((int32)(h & 0x8000ul)) << 16;
- return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit);
- }
- }
- __declspec(safe)
- static inline uniform int16 float_to_half(uniform float f) {
- if (__have_native_half) {
- return __float_to_half_uniform(f);
- }
- else {
- // via Fabian "ryg" Giesen.
- // https://gist.github.com/2156668
- uniform unsigned int32 sign_mask = 0x80000000u;
- uniform int32 o;
- uniform int32 fint = intbits(f);
- uniform int32 sign = fint & sign_mask;
- fint ^= sign;
- // NOTE all the integer compares in this function can be safely
- // compiled into signed compares since all operands are below
- // 0x80000000. Important if you want fast straight SSE2 code (since
- // there's no unsigned PCMPGTD).
- // Inf or NaN (all exponent bits set)
- // NaN->qNaN and Inf->Inf
- // unconditional assignment here, will override with right value for
- // the regular case below.
- uniform int32 f32infty = 255ul << 23;
- o = (fint > f32infty) ? 0x7e00u : 0x7c00u;
- // (De)normalized number or zero
- // update fint unconditionally to save the blending; we don't need it
- // anymore for the Inf/NaN case anyway.
- const uniform unsigned int32 round_mask = ~0xffful;
- const uniform int32 magic = 15ul << 23;
- const uniform int32 f16infty = 31ul << 23;
- uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask;
- fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
- if (fint < f32infty)
- o = fint2 >> 13; // Take the bits!
- return (o | (sign >> 16));
- }
- }
- __declspec(safe)
- static inline int16 float_to_half(float f) {
- if (__have_native_half) {
- return __float_to_half_varying(f);
- }
- else {
- // via Fabian "ryg" Giesen.
- // https://gist.github.com/2156668
- unsigned int32 sign_mask = 0x80000000u;
- int32 o;
- int32 fint = intbits(f);
- int32 sign = fint & sign_mask;
- fint ^= sign;
- // NOTE all the integer compares in this function can be safely
- // compiled into signed compares since all operands are below
- // 0x80000000. Important if you want fast straight SSE2 code (since
- // there's no unsigned PCMPGTD).
- // Inf or NaN (all exponent bits set)
- // NaN->qNaN and Inf->Inf
- // unconditional assignment here, will override with right value for
- // the regular case below.
- int32 f32infty = 255ul << 23;
- o = (fint > f32infty) ? 0x7e00u : 0x7c00u;
- // (De)normalized number or zero
- // update fint unconditionally to save the blending; we don't need it
- // anymore for the Inf/NaN case anyway.
- const unsigned int32 round_mask = ~0xffful;
- const int32 magic = 15ul << 23;
- const int32 f16infty = 31ul << 23;
- // Shift exponent down, denormalize if necessary.
- // NOTE This represents half-float denormals using single precision denormals.
- // The main reason to do this is that there's no shift with per-lane variable
- // shifts in SSE*, which we'd otherwise need. It has some funky side effects
- // though:
- // - This conversion will actually respect the FTZ (Flush To Zero) flag in
- // MXCSR - if it's set, no half-float denormals will be generated. I'm
- // honestly not sure whether this is good or bad. It's definitely interesting.
- // - If the underlying HW doesn't support denormals (not an issue with Intel
- // CPUs, but might be a problem on GPUs or PS3 SPUs), you will always get
- // flush-to-zero behavior. This is bad, unless you're on a CPU where you don't
- // care.
- // - Denormals tend to be slow. FP32 denormals are rare in practice outside of things
- // like recursive filters in DSP - not a typical half-float application. Whether
- // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW
- // may or may not have for denormals, this may well hit it.
- float fscale = floatbits(fint & round_mask) * floatbits(magic);
- fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul));
- int32 fint2 = intbits(fscale) - round_mask;
- if (fint < f32infty)
- o = fint2 >> 13; // Take the bits!
- return (o | (sign >> 16));
- }
- }
- __declspec(safe)
- static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
- if (__have_native_half) {
- return __half_to_float_uniform(h);
- }
- else {
- uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
- uniform unsigned int32 hem = h & (int32)0x7fffu; // Pick off exponent-mantissa bits
- uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
- uniform unsigned int32 xem = ((unsigned int32) hem) << 13;
- xem += 0x38000000; // (127 - 15) << 23
- return floatbits(xs | xem);
- }
- }
- __declspec(safe)
- static inline float half_to_float_fast(unsigned int16 h) {
- if (__have_native_half) {
- return __half_to_float_varying(h);
- }
- else {
- unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
- unsigned int32 hem = h & (int32)0x7fffu; // Pick off exponent-mantissa bits
- unsigned int32 xs = ((unsigned int32) hs) << 16;
- unsigned int32 xem = ((unsigned int32) hem) << 13;
- return floatbits(xs | (xem + 0x38000000 /* (127 - 15) << 23 */));
- }
- }
- __declspec(safe)
- static inline uniform int16 float_to_half_fast(uniform float f) {
- if (__have_native_half) {
- return __float_to_half_uniform(f);
- }
- else {
- uniform int32 x = intbits(f);
- uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
- uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
- uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
- uniform unsigned int32 hs = (xs >> 16); // Sign bit
- // Exponent unbias the single, then bias the halfp
- uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
- uniform unsigned int32 he = (hes << 10); // Exponent
- uniform int32 hm = (xm >> 13); // Mantissa
- uniform int32 ret = (hs | he | hm);
- if (xm & 0x00001000u) // Check for rounding
- // Round, might overflow to inf, this is OK
- ret += 1u;
- return (int16)ret;
- }
- }
- __declspec(safe)
- static inline int16 float_to_half_fast(float f) {
- if (__have_native_half) {
- return __float_to_half_varying(f);
- }
- else {
- int32 x = intbits(f);
- unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
- unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
- unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
- unsigned int32 hs = (xs >> 16); // Sign bit
- // Exponent unbias the single, then bias the halfp
- int32 hes = ((int)(xe >> 23)) - 127 + 15;
- unsigned int32 he = (hes << 10); // Exponent
- int32 hm = (xm >> 13); // Mantissa
- int32 ret = (hs | he | hm);
- if (xm & 0x00001000u) // Check for rounding
- // Round, might overflow to inf, this is OK
- ret += 1u;
- return (int16)ret;
- }
- }
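- // Usage sketch: the exact and _fast conversions share signatures, so a round
- // trip looks the same either way; the _fast variants skip the Inf/NaN and
- // denormal paths and are only safe for finite, normalized inputs:
- //
- //     float x = 1.0f + programIndex * 0.25f;      // finite, normalized
- //     int16 h = float_to_half(x);                 // or float_to_half_fast(x)
- //     float y = half_to_float((unsigned int16)h); // x recovered to ~11 bits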
- ///////////////////////////////////////////////////////////////////////////
- // float -> srgb8
- // https://gist.github.com/2246678, from Fabian "rygorous" Giesen.
- //
- // The basic ideas are still the same, only this time, we squeeze
- // everything into the table, even the linear part of the range; since we
- // are approximating the function as piecewise linear anyway, this is
- // fairly easy.
- //
- // In the exact version of the conversion, any value that produces an
- // output float less than 0.5 will be rounded to an integer of
- // zero. Inverting the linear part of the transform, we get:
- //
- // log2(0.5 / (255 * 12.92)) =~ -12.686
- //
- // which in turn means that any value smaller than about 2^(-12.687) will
- // return 0. What this means is that we can adapt the clamping code to
- // just clamp to [2^(-13), 1-eps] and we're covered. This means our table
- // needs to cover a range of 13 different exponents from -13 to -1.
- //
- // The table lookup, storage and interpolation works exactly the same way
- // as in the code above.
- //
- // Max error for the whole function (integer-rounded result minus "exact"
- // value, as computed in floats using the official formula): 0.544403 at
- // 0x3e9f8000
- __declspec(safe)
- static inline int
- float_to_srgb8(float inval)
- {
- static const uniform unsigned int table[104] = {
- 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
- 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
- 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
- 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
- 0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
- 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
- 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
- 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
- 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
- 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
- 0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
- 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
- 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
- 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
- 0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
- 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
- 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
- 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
- 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
- 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
- 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
- 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
- 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
- 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
- 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
- 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
- };
- static const uniform unsigned int almost_one = 0x3f7fffff;
- // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
- inval = max(inval, 0.0f);
- inval = min(inval, floatbits(almost_one));
- // Do the table lookup and unpack bias, scale
- unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20];
- unsigned int bias = (tab >> 16) << 9;
- unsigned int scale = tab & 0xfffful;
- // Grab next-highest mantissa bits and perform linear interpolation
- unsigned int t = (intbits(inval) >> 12) & 0xff;
- return (bias + scale*t) >> 16;
- }
- __declspec(safe)
- static inline uniform int
- float_to_srgb8(uniform float inval)
- {
- static const uniform unsigned int table[104] = {
- 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d,
- 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
- 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a,
- 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
- 0x010e0033, 0x01280033, 0x01410033, 0x015b0033,
- 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
- 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067,
- 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
- 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce,
- 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
- 0x06970158, 0x07420142, 0x07e30130, 0x087b0120,
- 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
- 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180,
- 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
- 0x11070264, 0x1238023e, 0x1357021d, 0x14660201,
- 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
- 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad,
- 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
- 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392,
- 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
- 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5,
- 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
- 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d,
- 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
- 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f,
- 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
- };
- static const uniform unsigned int almost_one = 0x3f7fffff;
- // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
- inval = max(inval, 0.0f);
- inval = min(inval, floatbits(almost_one));
- // Do the table lookup and unpack bias, scale
- uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20];
- uniform unsigned int bias = (tab >> 16) << 9;
- uniform unsigned int scale = tab & 0xfffful;
- // Grab next-highest mantissa bits and perform linear interpolation
- uniform unsigned int t = (intbits(inval) >> 12) & 0xff;
- return (bias + scale*t) >> 16;
- }
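- // Usage sketch: converting a linear-light value in [0,1] to an 8-bit sRGB
- // code with the table-based routine above; out-of-range inputs are clamped
- // internally, so no pre-clamp is needed:
- //
- //     int srgb = float_to_srgb8(0.5f); // ~188 for the standard sRGB curve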
- ///////////////////////////////////////////////////////////////////////////
- // RNG stuff
- struct RNGState {
- unsigned int z1, z2, z3, z4;
- };
- static inline unsigned int random(varying RNGState * uniform state)
- {
- unsigned int b;
- b = ((state->z1 << 6) ^ state->z1) >> 13;
- state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
- b = ((state->z2 << 2) ^ state->z2) >> 27;
- state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
- b = ((state->z3 << 13) ^ state->z3) >> 21;
- state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
- b = ((state->z4 << 3) ^ state->z4) >> 12;
- state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
- return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
- }
- static inline uniform unsigned int random(uniform RNGState * uniform state)
- {
- uniform unsigned int b;
- b = ((state->z1 << 6) ^ state->z1) >> 13;
- state->z1 = ((state->z1 & 4294967294U) << 18) ^ b;
- b = ((state->z2 << 2) ^ state->z2) >> 27;
- state->z2 = ((state->z2 & 4294967288U) << 2) ^ b;
- b = ((state->z3 << 13) ^ state->z3) >> 21;
- state->z3 = ((state->z3 & 4294967280U) << 7) ^ b;
- b = ((state->z4 << 3) ^ state->z4) >> 12;
- state->z4 = ((state->z4 & 4294967168U) << 13) ^ b;
- return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4);
- }
- static inline float frandom(varying RNGState * uniform state)
- {
- unsigned int irand = random(state);
- irand &= (1ul<<23)-1;
- return floatbits(0x3F800000 | irand)-1.0f;
- }
- static inline uniform float frandom(uniform RNGState * uniform state)
- {
- uniform unsigned int irand = random(state);
- irand &= (1ul<<23)-1;
- return floatbits(0x3F800000 | irand)-1.0f;
- }
- static inline void seed_rng(varying RNGState * uniform state,
- unsigned int seed) {
- state->z1 = seed;
- state->z2 = seed ^ 0xbeeff00d;
- state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
- state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
- ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
- }
- static inline void seed_rng(uniform RNGState * uniform state,
- uniform unsigned int seed) {
- state->z1 = seed;
- state->z2 = seed ^ 0xbeeff00d;
- state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16);
- state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) |
- ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24);
- }
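- // Usage sketch: the generator keeps four shift-register components per lane
- // (a combined Tausworthe-style design). frandom() takes 23 random bits, ORs
- // them into the mantissa of 1.0f (0x3F800000) to get a value in [1, 2), then
- // subtracts 1.0f to land in [0, 1). Typical per-lane setup:
- //
- //     RNGState rng;
- //     seed_rng(&rng, 1234 + programIndex); // decorrelate the lanes
- //     float u = frandom(&rng);             // uniform in [0, 1)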
- static inline void fastmath() {
- __fastmath();
- }
- ///////////////////////////////////////////////////////////////////////////
- // saturation arithmetic
- static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
- uniform unsigned int8 a_unsig = a, b_unsig = b;
- uniform unsigned int8 result = a_unsig + b_unsig;
- a_unsig = (a_unsig >> 7) + INT8_MAX;
- if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
- result = a_unsig;
- return result;
- }
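- // How the branchless saturation above works: after the wrapping add, a_unsig
- // is replaced by the saturation value: (a_unsig >> 7) is 1 for negative a,
- // so a_unsig becomes 128 (INT8_MIN as bits) for negative a and 127 (INT8_MAX)
- // otherwise. The test ((a ^ b) | ~(b ^ result)) >= 0 holds exactly when a and
- // b share a sign but the result's sign differs, i.e. on overflow. Example:
- // saturating_add((int8)100, (int8)100) wraps to -56, the test fires, and the
- // precomputed 127 is returned instead.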
- static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
- return __padds_vi8(a, b);
- }
- static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
- uniform unsigned int16 a_unsig = a, b_unsig = b;
- uniform unsigned int16 result = a_unsig + b_unsig;
- a_unsig = (a_unsig >> 15) + INT16_MAX;
- if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
- result = a_unsig;
- return result;
- }
- static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
- return __padds_vi16(a, b);
- }
- static inline uniform int32 saturating_add(uniform int32 a, uniform int32 b) {
- uniform unsigned int32 a_unsig = a, b_unsig = b;
- uniform unsigned int32 result = a_unsig + b_unsig;
- a_unsig = (a_unsig >> 31) + INT32_MAX;
- if ((uniform int32) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
- result = a_unsig;
- return result;
- }
- static inline varying int32 saturating_add(varying int32 a, varying int32 b) {
- varying unsigned int32 a_unsig = a, b_unsig = b;
- varying unsigned int32 result = a_unsig + b_unsig;
- a_unsig = (a_unsig >> 31) + INT32_MAX;
- if ((varying int32) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
- result = a_unsig;
- return result;
- }
- static inline uniform int64 saturating_add(uniform int64 a, uniform int64 b) {
- uniform unsigned int64 a_unsig = a, b_unsig = b;
- uniform unsigned int64 result = a_unsig + b_unsig;
- a_unsig = (a_unsig >> 63) + INT64_MAX;
- if ((uniform int64) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
- result = a_unsig;
- return result;
- }
- static inline varying int64 saturating_add(varying int64 a, varying int64 b) {
- varying unsigned int64 a_unsig = a, b_unsig = b;
- varying unsigned int64 result = a_unsig + b_unsig;
- a_unsig = (a_unsig >> 63) + INT64_MAX;
- if ((varying int64) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
- result = a_unsig;
- return result;
- }
- static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a,
- uniform unsigned int8 b) {
- uniform unsigned int8 result = a + b;
- result |= (-(uniform int8)(result < a));
- return result;
- }
- static inline varying unsigned int8 saturating_add(varying unsigned int8 a,
- varying unsigned int8 b) {
- return __paddus_vi8(a, b);
- }
- static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a,
- uniform unsigned int16 b) {
- uniform unsigned int16 result = a + b;
- result |= (-(uniform int16)(result < a));
- return result;
- }
- static inline varying unsigned int16 saturating_add(varying unsigned int16 a,
- varying unsigned int16 b) {
- return __paddus_vi16(a, b);
- }
- static inline uniform unsigned int32 saturating_add(uniform unsigned int32 a,
- uniform unsigned int32 b) {
- uniform unsigned int32 result = a + b;
- result |= (-(uniform int32)(result < a));
- return result;
- }
- static inline varying unsigned int32 saturating_add(varying unsigned int32 a,
- varying unsigned int32 b) {
- varying unsigned int32 result = a + b;
- result |= (-(varying int32)(result < a));
- return result;
- }
- static inline uniform unsigned int64 saturating_add(uniform unsigned int64 a,
- uniform unsigned int64 b) {
- uniform unsigned int64 result = a + b;
- result |= (-(uniform int64)(result < a));
- return result;
- }
- static inline varying unsigned int64 saturating_add(varying unsigned int64 a,
- varying unsigned int64 b) {
- varying unsigned int64 result = a + b;
- result |= (-(varying int64)(result < a));
- return result;
- }
- static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
- uniform unsigned int8 a_unsig = a, b_unsig = b;
- uniform unsigned int8 result = a_unsig - b_unsig;
- a_unsig = (a_unsig >> 7) + INT8_MAX;
- if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
- result = a_unsig;
- return result;
- }
- static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
- return __psubs_vi8(a, b);
- }
- static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
- uniform unsigned int16 a_unsig = a, b_unsig = b;
- uniform unsigned int16 result = a_unsig - b_unsig;
- a_unsig = (a_unsig >> 15) + INT16_MAX;
- if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
- result = a_unsig;
- return result;
- }
- static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
- return __psubs_vi16(a, b);
- }
- static inline uniform int32 saturating_sub(uniform int32 a, uniform int32 b) {
- uniform unsigned int32 a_unsig = a, b_unsig = b;
- uniform unsigned int32 result = a_unsig - b_unsig;
- a_unsig = (a_unsig >> 31) + INT32_MAX;
- if ((uniform int32) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
- result = a_unsig;
- return result;
- }
- static inline varying int32 saturating_sub(varying int32 a, varying int32 b) {
- varying unsigned int32 a_unsig = a, b_unsig = b;
- varying unsigned int32 result = a_unsig - b_unsig;
- a_unsig = (a_unsig >> 31) + INT32_MAX;
- if ((varying int32) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
- result = a_unsig;
- return result;
- }
- static inline uniform int64 saturating_sub(uniform int64 a, uniform int64 b) {
- uniform unsigned int64 a_unsig = a, b_unsig = b;
- uniform unsigned int64 result = a_unsig - b_unsig;
- a_unsig = (a_unsig >> 63) + INT64_MAX;
- if ((uniform int64) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
- result = a_unsig;
- return result;
- }
- static inline varying int64 saturating_sub(varying int64 a, varying int64 b) {
- varying unsigned int64 a_unsig = a, b_unsig = b;
- varying unsigned int64 result = a_unsig - b_unsig;
- a_unsig = (a_unsig >> 63) + INT64_MAX;
- if ((varying int64) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
- result = a_unsig;
- return result;
- }
- static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a,
- uniform unsigned int8 b) {
- uniform unsigned int8 result = a - b;
- result &= (-(uniform int8)(result <= a));
- return result;
- }
- static inline varying unsigned int8 saturating_sub(varying unsigned int8 a,
- varying unsigned int8 b) {
- return __psubus_vi8(a, b);
- }
- static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a,
- uniform unsigned int16 b) {
- uniform unsigned int16 result = a - b;
- result &= (-(uniform int16)(result <= a));
- return result;
- }
- static inline varying unsigned int16 saturating_sub(varying unsigned int16 a,
- varying unsigned int16 b) {
- return __psubus_vi16(a, b);
- }
- static inline uniform unsigned int32 saturating_sub(uniform unsigned int32 a,
- uniform unsigned int32 b) {
- uniform unsigned int32 result = a - b;
- result &= (-(uniform int32)(result <= a));
- return result;
- }
- static inline varying unsigned int32 saturating_sub(varying unsigned int32 a,
- varying unsigned int32 b) {
- varying unsigned int32 result = a - b;
- result &= (-(varying int32)(result <= a));
- return result;
- }
- static inline uniform unsigned int64 saturating_sub(uniform unsigned int64 a,
- uniform unsigned int64 b) {
- uniform unsigned int64 result = a - b;
- result &= (-(uniform int64)(result <= a));
- return result;
- }
- static inline varying unsigned int64 saturating_sub(varying unsigned int64 a,
- varying unsigned int64 b) {
- varying unsigned int64 result = a - b;
- result &= (-(varying int64)(result <= a));
- return result;
- }
- static inline uniform int8 saturating_div(uniform int8 a, uniform int8 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((uniform unsigned int8) a + INT8_MIN));
- return a / b;
- }
- static inline varying int8 saturating_div(varying int8 a, varying int8 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((varying unsigned int8) a + INT8_MIN));
- return a / b;
- }
- static inline uniform int16 saturating_div(uniform int16 a, uniform int16 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((uniform unsigned int16) a + INT16_MIN));
- return a / b;
- }
- static inline varying int16 saturating_div(varying int16 a, varying int16 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((varying unsigned int16) a + INT16_MIN));
- return a / b;
- }
- static inline uniform int32 saturating_div(uniform int32 a, uniform int32 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((uniform unsigned int32) a + INT32_MIN));
- return a / b;
- }
- static inline varying int32 saturating_div(varying int32 a, varying int32 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((varying unsigned int32) a + INT32_MIN));
- return a / b;
- }
- static inline uniform int64 saturating_div(uniform int64 a, uniform int64 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((uniform unsigned int64) a + INT64_MIN));
- return a / b;
- }
- static inline varying int64 saturating_div(varying int64 a, varying int64 b) {
- /* Only one way to overflow, so test for and prevent it. */
- a += !((b + 1) | ((varying unsigned int64) a + INT64_MIN));
- return a / b;
- }
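- // How the overflow guard in the signed saturating_div variants works: the
- // only case that can overflow is INT_MIN / -1. In the expression
- // !((b + 1) | ((unsigned) a + INT_MIN)), the term (b + 1) is zero only for
- // b == -1, and the unsigned sum is zero only for a == INT_MIN, so the !
- // yields 1 exactly for that pair and bumps a to INT_MIN + 1 before dividing.
- // Example for int8: saturating_div((int8)-128, (int8)-1) actually divides
- // -127 by -1 and returns 127 == INT8_MAX, the saturated quotient.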
- static inline uniform unsigned int8 saturating_div(uniform unsigned int8 a,
- uniform unsigned int8 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline varying unsigned int8 saturating_div(varying unsigned int8 a,
- varying unsigned int8 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline uniform unsigned int16 saturating_div(uniform unsigned int16 a,
- uniform unsigned int16 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline varying unsigned int16 saturating_div(varying unsigned int16 a,
- varying unsigned int16 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline uniform unsigned int32 saturating_div(uniform unsigned int32 a,
- uniform unsigned int32 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline varying unsigned int32 saturating_div(varying unsigned int32 a,
- varying unsigned int32 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline uniform unsigned int64 saturating_div(uniform unsigned int64 a,
- uniform unsigned int64 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline varying unsigned int64 saturating_div(varying unsigned int64 a,
- varying unsigned int64 b) {
- /* No overflow possible */
- return a / b;
- }
- static inline uniform int8 saturating_mul(uniform int8 a, uniform int8 b) {
- uniform int16 result = (uniform int16) a * (uniform int16) b;
- uniform unsigned int8 result2 = ((uniform unsigned int8) (a ^ b) >> 7) + INT8_MAX;
- uniform int8 hi = result >> 8;
- uniform int8 lo = result;
- if (hi != (lo >> 7))
- result = result2;
- return result;
- }
- static inline varying int8 saturating_mul(varying int8 a, varying int8 b) {
- varying int16 result = (varying int16) a * (varying int16) b;
- varying unsigned int8 result2 = ((varying unsigned int8) (a ^ b) >> 7) + INT8_MAX;
- varying int8 hi = result >> 8;
- varying int8 lo = result;
- if (hi != (lo >> 7))
- result = result2;
- return result;
- }
- static inline uniform int16 saturating_mul(uniform int16 a, uniform int16 b) {
- uniform int32 result = (uniform int32) a * (uniform int32) b;
- uniform unsigned int16 result2 = ((uniform unsigned int16) (a ^ b) >> 15) + INT16_MAX;
- uniform int16 hi = result >> 16;
- uniform int16 lo = result;
- if (hi != (lo >> 15))
- result = result2;
- return result;
- }
- static inline varying int16 saturating_mul(varying int16 a, varying int16 b) {
- varying int32 result = (varying int32) a * (varying int32) b;
- varying unsigned int16 result2 = ((varying unsigned int16) (a ^ b) >> 15) + INT16_MAX;
- varying int16 hi = result >> 16;
- varying int16 lo = result;
- if (hi != (lo >> 15))
- result = result2;
- return result;
- }
- static inline uniform int32 saturating_mul(uniform int32 a, uniform int32 b) {
- uniform int64 result = (uniform int64) a * (uniform int64) b;
- uniform unsigned int32 result2 = ((uniform unsigned int32) (a ^ b) >> 31) + INT32_MAX;
- uniform int32 hi = result >> 32;
- uniform int32 lo = result;
- if (hi != (lo >> 31))
- result = result2;
- return result;
- }
- static inline varying int32 saturating_mul(varying int32 a, varying int32 b) {
- varying int64 result = (varying int64) a * (varying int64) b;
- varying unsigned int32 result2 = ((varying unsigned int32) (a ^ b) >> 31) + INT32_MAX;
- varying int32 hi = result >> 32;
- varying int32 lo = result;
- if (hi != (lo >> 31))
- result = result2;
- return result;
- }
- static inline uniform unsigned int8 saturating_mul(uniform unsigned int8 a,
- uniform unsigned int8 b) {
- uniform unsigned int16 result = (uniform unsigned int16) a *
- (uniform unsigned int16) b;
- uniform unsigned int8 hi = result >> 8;
- uniform unsigned int8 lo = result;
- return lo | -(uniform int8)!!hi;
- }
- static inline varying unsigned int8 saturating_mul(varying unsigned int8 a,
- varying unsigned int8 b) {
- varying unsigned int16 result = (varying unsigned int16) a *
- (varying unsigned int16) b;
- varying unsigned int8 hi = result >> 8;
- varying unsigned int8 lo = result;
- return lo | -(varying int8)!!hi;
- }
- static inline uniform unsigned int16 saturating_mul(uniform unsigned int16 a,
- uniform unsigned int16 b) {
- uniform unsigned int32 result = (uniform unsigned int32) a *
- (uniform unsigned int32) b;
- uniform unsigned int16 hi = result >> 16;
- uniform unsigned int16 lo = result;
- return lo | -(uniform int16)!!hi;
- }
- static inline varying unsigned int16 saturating_mul(varying unsigned int16 a,
- varying unsigned int16 b) {
- varying unsigned int32 result = (varying unsigned int32) a *
- (varying unsigned int32) b;
- varying unsigned int16 hi = result >> 16;
- varying unsigned int16 lo = result;
- return lo | -(varying int16)!!hi;
- }
- static inline uniform unsigned int32 saturating_mul(uniform unsigned int32 a,
- uniform unsigned int32 b) {
- uniform unsigned int64 result = (uniform unsigned int64) a *
- (uniform unsigned int64) b;
- uniform unsigned int32 hi = result >> 32;
- uniform unsigned int32 lo = result;
- return lo | -(uniform int32)!!hi;
- }
- static inline varying unsigned int32 saturating_mul(varying unsigned int32 a,
- varying unsigned int32 b) {
- varying unsigned int64 result = (varying unsigned int64) a *
- (varying unsigned int64) b;
- varying unsigned int32 hi = result >> 32;
- varying unsigned int32 lo = result;
- return lo | -(varying int32)!!hi;
- }
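- // How the unsigned widening-multiply saturation above works: the product is
- // formed at twice the operand width and hi holds its top half. -(int)!!hi is
- // 0 when hi == 0 and all-ones otherwise, so lo | -(int)!!hi either passes the
- // exact product through or forces the all-ones maximum. Example at 8 bits:
- // 200 * 2 = 400 = 0x190, so hi = 1 and the result saturates to 0xff = 255.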
- static inline uniform int64 saturating_mul(uniform int64 a, uniform int64 b) {
- uniform unsigned int64 ret = 0;
- uniform int8 sign = (((a > 0) && (b > 0)) || ((a < 0) && (b < 0))) ? 1 : -1;
- uniform unsigned int64 a_abs = 0;
- uniform unsigned int64 b_abs = 0;
- if (a == INT64_MIN)
- // Operation "-" is undefined for "INT64_MIN", as it causes overflow.
- // But converting INT64_MIN to unsigned type yields the correct result,
- // i.e. it will be positive value -INT64_MIN.
- // See 6.3.1.3 section in C99 standart for more details (ISPC follows
- // C standard, unless it's specifically different in the language).
- a_abs = (uniform unsigned int64) INT64_MIN;
- else
- a_abs = (a > 0) ? a : -a;
- if (b == INT64_MIN)
- b_abs = (uniform unsigned int64) INT64_MIN;
- else
- b_abs = (b > 0) ? b : -b;
- uniform unsigned int32 a0 = a_abs & 0xFFFFFFFF;
- uniform unsigned int32 b0 = b_abs & 0xFFFFFFFF;
- uniform unsigned int32 a1 = a_abs >> 32;
- uniform unsigned int32 b1 = b_abs >> 32;
- if ((a1 != 0) && (b1 != 0)) {
- if (sign > 0) {
- return INT64_MAX;
- }
- else {
- return INT64_MIN;
- }
- } else if (a1 != 0) {
- ret = saturating_add((uniform unsigned int64) saturating_mul(b0, a1) << 32,
- (uniform unsigned int64) a0 * b0);
- } else if (b1 != 0) {
- ret = saturating_add((uniform unsigned int64) saturating_mul(a0, b1) << 32,
- (uniform unsigned int64) a0 * b0);
- } else {
- ret = a_abs * b_abs;
- }
- if ((sign < 0) && (ret >= (uniform unsigned int64) INT64_MIN)) {
- return INT64_MIN;
- } else if ((sign > 0) && (ret >= INT64_MAX)) {
- return INT64_MAX;
- } else {
- return ret * sign;
- }
- }
- static inline varying int64 saturating_mul(varying int64 a, varying int64 b) {
- varying unsigned int64 ret = 0;
- varying int8 sign = (((a > 0) && (b > 0)) || ((a < 0) && (b < 0))) ? 1 : -1;
- varying unsigned int64 a_abs = 0;
- varying unsigned int64 b_abs = 0;
- if (a == INT64_MIN)
- // Operation "-" is undefined for "INT64_MIN", as it causes overflow.
- // But converting INT64_MIN to unsigned type yields the correct result,
- // i.e. it will be positive value -INT64_MIN.
- // See 6.3.1.3 section in C99 standart for more details (ISPC follows
- // C standard, unless it's specifically different in the language).
- a_abs = (varying unsigned int64) INT64_MIN;
- else
- a_abs = (a > 0) ? a : -a;
- if (b == INT64_MIN)
- b_abs = (varying unsigned int64) INT64_MIN;
- else
- b_abs = (b > 0) ? b : -b;
- varying unsigned int32 a0 = a_abs & 0xFFFFFFFF;
- varying unsigned int32 b0 = b_abs & 0xFFFFFFFF;
- varying unsigned int32 a1 = a_abs >> 32;
- varying unsigned int32 b1 = b_abs >> 32;
- if ((a1 != 0) && (b1 != 0)) {
- if (sign > 0) {
- return INT64_MAX;
- }
- else {
- return INT64_MIN;
- }
- } else if (a1 != 0) {
- ret = saturating_add((varying unsigned int64) saturating_mul(b0, a1) << 32,
- (varying unsigned int64) a0 * b0);
- } else if (b1 != 0) {
- ret = saturating_add((varying unsigned int64) saturating_mul(a0, b1) << 32,
- (varying unsigned int64) a0 * b0);
- } else {
- ret = a_abs * b_abs;
- }
- if ((sign < 0) && (ret >= (varying unsigned int64) INT64_MIN)) {
- return INT64_MIN;
- } else if ((sign > 0) && (ret >= INT64_MAX)) {
- return INT64_MAX;
- } else {
- return ret * sign;
- }
- }
- static inline uniform unsigned int64 saturating_mul(uniform unsigned int64 a,
- uniform unsigned int64 b) {
- uniform unsigned int32 a0 = a & 0xFFFFFFFF;
- uniform unsigned int32 b0 = b & 0xFFFFFFFF;
- uniform unsigned int32 a1 = a >> 32;
- uniform unsigned int32 b1 = b >> 32;
- if ((a1 != 0) && (b1 != 0)) {
- return UINT64_MAX;
- } else if (a1 != 0) {
- return saturating_add((uniform unsigned int64) saturating_mul(b0, a1) << 32,
- (uniform unsigned int64) a0 * b0);
- } else if (b1 != 0) {
- return saturating_add((uniform unsigned int64) saturating_mul(a0, b1) << 32,
- (uniform unsigned int64) a0 * b0);
- } else {
- return a * b;
- }
- }
- static inline varying unsigned int64 saturating_mul(varying unsigned int64 a,
- varying unsigned int64 b) {
- varying unsigned int32 a0 = a & 0xFFFFFFFF;
- varying unsigned int32 b0 = b & 0xFFFFFFFF;
- varying unsigned int32 a1 = a >> 32;
- varying unsigned int32 b1 = b >> 32;
- if ((a1 != 0) && (b1 != 0)) {
- return UINT64_MAX;
- } else if (a1 != 0) {
- return saturating_add((varying unsigned int64) saturating_mul(b0, a1) << 32,
- (varying unsigned int64) a0 * b0);
- } else if (b1 != 0) {
- return saturating_add((varying unsigned int64) saturating_mul(a0, b1) << 32,
- (varying unsigned int64) a0 * b0);
- } else {
- return a * b;
- }
- }
- ///////////////////////////////////////////////////////////////////////////
- // rdrand
- static inline uniform bool rdrand(float * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- uniform int32 irand;
- uniform bool success = __rdrand_i32(&irand);
- if (success) {
- irand &= (1ul<<23)-1;
- *ptr = floatbits(0x3F800000 | irand)-1.0f;
- }
- return success;
- }
- }
- static inline bool rdrand(varying float * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- bool success = false;
- foreach_active (index) {
- uniform int32 irand;
- if (__rdrand_i32(&irand)) {
- // FIXME: it probably would be preferable, here and in the
- // following rdrand() function, to do the int->float stuff
- // in vector form. However, we need to be careful to not
- // clobber any existing already-set values in *ptr with
- // inactive lanes here...
- irand &= (1ul<<23)-1;
- *ptr = floatbits(0x3F800000 | irand)-1.0f;
- success = true;
- }
- }
- return success;
- }
- }
- static inline bool rdrand(float * ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- float * uniform ptrs[programCount];
- ptrs[programIndex] = ptr;
- bool success = false;
- foreach_active (index) {
- uniform int32 irand;
- if (__rdrand_i32(&irand)) {
- irand &= (1ul<<23)-1;
- *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f;
- success = true;
- }
- }
- return success;
- }
- }
- static inline uniform bool rdrand(int16 * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else
- return __rdrand_i16(ptr);
- }
- static inline bool rdrand(varying int16 * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- bool success = false;
- foreach_active (index) {
- uniform int16 irand;
- if (__rdrand_i16(&irand)) {
- *ptr = irand;
- success = true;
- }
- }
- return success;
- }
- }
- static inline bool rdrand(int16 * ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- int16 * uniform ptrs[programCount];
- ptrs[programIndex] = ptr;
- bool success = false;
- foreach_active (index) {
- uniform int16 irand;
- if (__rdrand_i16(&irand)) {
- *ptrs[index] = irand;
- success = true;
- }
- }
- return success;
- }
- }
- static inline uniform bool rdrand(int32 * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else
- return __rdrand_i32(ptr);
- }
- static inline bool rdrand(varying int32 * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- bool success = false;
- foreach_active (index) {
- uniform int32 irand;
- if (__rdrand_i32(&irand)) {
- *ptr = irand;
- success = true;
- }
- }
- return success;
- }
- }
- static inline bool rdrand(int32 * ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- int32 * uniform ptrs[programCount];
- ptrs[programIndex] = ptr;
- bool success = false;
- foreach_active (index) {
- uniform int32 irand;
- if (__rdrand_i32(&irand)) {
- *ptrs[index] = irand;
- success = true;
- }
- }
- return success;
- }
- }
- static inline uniform bool rdrand(int64 * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else
- return __rdrand_i64(ptr);
- }
- static inline bool rdrand(varying int64 * uniform ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- bool success = false;
- foreach_active (index) {
- uniform int64 irand;
- if (__rdrand_i64(&irand)) {
- *ptr = irand;
- success = true;
- }
- }
- return success;
- }
- }
- static inline bool rdrand(int64 * ptr) {
- if (__have_native_rand == false)
- return false;
- else {
- int64 * uniform ptrs[programCount];
- ptrs[programIndex] = ptr;
- bool success = false;
- foreach_active (index) {
- uniform int64 irand;
- if (__rdrand_i64(&irand)) {
- *ptrs[index] = irand;
- success = true;
- }
- }
- return success;
- }
- }
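- // Usage sketch: rdrand() returns false when the hardware instruction is
- // unavailable, so callers typically fall back to the software RNG above:
- //
- //     float u;
- //     if (!rdrand(&u)) { // varying overload; fills the active lanes
- //         RNGState rng;
- //         seed_rng(&rng, 1234 + programIndex);
- //         u = frandom(&rng);
- //     }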
- ///////////////////////////////////////////////////////////////////////////
- // Fast vector integer division
- /* These tables and the algorithms in the __fast_idiv() functions below are
- from Halide; the idea is based on the paper "Division by Invariant
- Integers using Multiplication" by Granlund and Montgomery.
- Copyright (c) 2012 MIT CSAIL
- Developed by:
- The Halide team
- MIT CSAIL
- http://halide-lang.org
- Permission is hereby granted, free of charge, to any person obtaining a
- copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- The above copyright notice and this permission notice shall be included
- in all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
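- // How these tables are consumed (a sketch; the __fast_idiv() routines that
- // read them appear later in this file): each row, indexed by divisor - 2,
- // stores {method, multiplier, shift}. Method 0 marks a power-of-two divisor,
- // handled by a plain shift. For an N-bit operand (N == 8 for the u8 table),
- // method 1 computes roughly
- //
- //     q = (n * multiplier) >> (N + shift);
- //
- // and method 2 is the variant whose true magic constant needs N+1 bits, so
- // the table stores it minus 2^N and the quotient is recovered as roughly
- //
- //     t = (n * multiplier) >> N;
- //     q = (t + ((n - t) >> 1)) >> shift;
- //
- // Checking against the u8 table: divisor 31 has {2, 9, 4}; 9 + 256 == 265 ==
- // ceil(2^13 / 31), and e.g. n = 255 gives t = 8, q = (8 + 123) >> 4 = 8.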
- static const uniform int64 __idiv_table_u8[][3] = {
- {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2},
- {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2},
- {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3},
- {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2},
- {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4},
- {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1},
- {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4},
- {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2},
- {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4},
- {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4},
- {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5},
- {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4},
- {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5},
- {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5},
- {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5},
- {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5},
- {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4},
- {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5},
- {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5},
- {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5},
- {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6},
- {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6},
- {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6},
- {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6},
- {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3},
- {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2},
- {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3},
- {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6},
- {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6},
- {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6},
- {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6},
- {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6},
- {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4},
- {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6},
- {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6},
- {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6},
- {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6},
- {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6},
- {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6},
- {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6},
- {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6},
- {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6},
- {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6},
- {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4},
- {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6},
- {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5},
- {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4},
- {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6},
- {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6},
- {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6},
- {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6},
- {1, 53LL, 5}, {1, 211LL, 7}, {1, 105LL, 6},
- {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6},
- {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6},
- {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6},
- {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6},
- {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7},
- {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5},
- {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7},
- {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6},
- {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7},
- {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3},
- {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7},
- {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6},
- {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4},
- {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7},
- {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7},
- {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2},
- {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6},
- {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5},
- {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7},
- {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7},
- {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7},
- {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7},
- {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7},
- {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7},
- {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7},
- {1, 141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5},
- {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6},
- {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4},
- {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7},
- {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7},
- {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7},
- {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6},
- {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8},
- };
- static const uniform int64 __idiv_table_s8[][3] = {
- {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2},
- {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2},
- {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2},
- {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2},
- {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4},
- {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1},
- {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4},
- {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2},
- {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4},
- {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4},
- {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4},
- {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4},
- {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4},
- {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0},
- {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3},
- {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2},
- {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4},
- {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4},
- {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4},
- {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5},
- {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6},
- {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3},
- {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4},
- {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5},
- {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3},
- {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2},
- {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3},
- {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5},
- {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4},
- {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5},
- {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2},
- {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5},
- {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4},
- {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1},
- {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4},
- {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6},
- {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6},
- {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6},
- {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4},
- {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3},
- {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6},
- {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7},
- };
- static const uniform int64 __idiv_table_u16[][3] = {
- {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2},
- {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2},
- {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3},
- {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2},
- {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4},
- {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4},
- {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4},
- {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4},
- {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4},
- {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4},
- {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5},
- {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2},
- {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5},
- {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5},
- {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5},
- {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5},
- {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4},
- {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5},
- {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4},
- {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3},
- {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6},
- {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5},
- {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6},
- {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6},
- {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6},
- {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6},
- {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6},
- {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6},
- {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6},
- {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6},
- {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6},
- {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6},
- {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6},
- {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5},
- {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6},
- {2, 12863LL, 6}, {2, 12137LL, 6}, {1, 2405LL, 2},
- {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6},
- {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6},
- {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6},
- {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3},
- {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6},
- {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6},
- {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3},
- {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7},
- {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7},
- {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6},
- {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7},
- {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7},
- {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4},
- {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6},
- {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4},
- {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7},
- {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7},
- {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4},
- {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6},
- {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7},
- {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7},
- {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 7},
- {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7},
- {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6},
- {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7},
- {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7},
- {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7},
- {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7},
- {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7},
- {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6},
- {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4},
- {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7},
- {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6},
- {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7},
- {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7},
- {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7},
- {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7},
- {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7},
- {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6},
- {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4},
- {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6},
- {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7},
- {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7},
- {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4},
- {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5},
- {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6},
- {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7},
- {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7},
- {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8},
- };
- static const uniform int64 __idiv_table_s16[][3] = {
- {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2},
- {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1},
- {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2},
- {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2},
- {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4},
- {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4},
- {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0},
- {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1},
- {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3},
- {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3},
- {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4},
- {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2},
- {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4},
- {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3},
- {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5},
- {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1},
- {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4},
- {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5},
- {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4},
- {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3},
- {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6},
- {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5},
- {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2},
- {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6},
- {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6},
- {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5},
- {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6},
- {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5},
- {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2},
- {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4},
- {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6},
- {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6},
- {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3},
- {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5},
- {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5},
- {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2},
- {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5},
- {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6},
- {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5},
- {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3},
- {1, 8595LL, 4}, {1, 34101LL, 6}, {1, 16913LL, 5},
- {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6},
- {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3},
- {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6},
- {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6},
- {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6},
- {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5},
- {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6},
- {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4},
- {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6},
- {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4},
- {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4},
- {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6},
- {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4},
- {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6},
- {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6},
- {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5},
- {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1},
- {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4},
- {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6},
- {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7},
- {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5},
- {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7},
- {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7},
- {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3},
- {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6},
- {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4},
- {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7},
- {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6},
- {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7},
- {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2},
- {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6},
- {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7},
- {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6},
- {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6},
- {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4},
- {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6},
- {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7},
- {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7},
- {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4},
- {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5},
- {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6},
- {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7},
- {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7},
- {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7},
- };
- static const uniform int64 __idiv_table_u32[][3] = {
- {0, 0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2},
- {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2},
- {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3},
- {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2},
- {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4},
- {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4},
- {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4},
- {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3},
- {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4},
- {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4},
- {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5},
- {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5},
- {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5},
- {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3},
- {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5},
- {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4},
- {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4},
- {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5},
- {2, 613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5},
- {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4},
- {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6},
- {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1},
- {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6},
- {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6},
- {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6},
- {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6},
- {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6},
- {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6},
- {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6},
- {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6},
- {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6},
- {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6},
- {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5},
- {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6},
- {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6},
- {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6},
- {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6},
- {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6},
- {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4},
- {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6},
- {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6},
- {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6},
- {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7},
- {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7},
- {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7},
- {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6},
- {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7},
- {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7},
- {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7},
- {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5},
- {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5},
- {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5},
- {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7},
- {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7},
- {1, 3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5},
- {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6},
- {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5},
- {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6},
- {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7},
- {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7},
- {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7},
- {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7},
- {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7},
- {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4},
- {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6},
- {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6},
- {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7},
- {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6},
- {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6},
- {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7},
- {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7},
- {1, 1278501893LL, 6}, {2, 795364315LL, 7}, {2, 771906565LL, 7},
- {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7},
- {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7},
- {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7},
- {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6},
- {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7},
- {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7},
- {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6},
- {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4},
- {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6},
- {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7},
- {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4},
- {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7},
- {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8},
- };
- static const uniform int64 __idiv_table_s32[][3] = {
- {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2},
- {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2},
- {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2},
- {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2},
- {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4},
- {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3},
- {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2},
- {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3},
- {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4},
- {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4},
- {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4},
- {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5},
- {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4},
- {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3},
- {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5},
- {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4},
- {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4},
- {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1},
- {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5},
- {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4},
- {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6},
- {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1},
- {1, 2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6},
- {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6},
- {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5},
- {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5},
- {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5},
- {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5},
- {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4},
- {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6},
- {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6},
- {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3},
- {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5},
- {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3},
- {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5},
- {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6},
- {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6},
- {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5},
- {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4},
- {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {1, 70991195LL, 1},
- {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6},
- {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6},
- {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6},
- {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7},
- {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6},
- {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6},
- {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7},
- {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7},
- {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7},
- {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5},
- {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5},
- {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5},
- {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6},
- {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3},
- {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5},
- {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6},
- {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5},
- {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6},
- {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7},
- {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5},
- {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7},
- {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7},
- {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7},
- {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4},
- {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6},
- {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6},
- {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5},
- {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6},
- {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6},
- {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2},
- {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6},
- {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7},
- {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3},
- {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7},
- {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6},
- {1, 605457945LL, 5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6},
- {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7},
- {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7},
- {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6},
- {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4},
- {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6},
- {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7},
- {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4},
- {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7},
- {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7},
- };
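- ///////////////////////////////////////////////////////////////////////////
- // Fast integer division by a uniform divisor
- //
- // Each __idiv_table_* row is indexed by (divisor - 2) and holds a
- // { method, multiplier, shift } triple, as consumed by the __fast_idiv
- // overloads below:
- //
- //   method 0: the divisor is a power of two; just shift right.
- //   method 1: q = (n * multiplier) >> (width + shift)
- //   method 2: the exact multiplier needs one bit more than the word
- //             width, so a fixup step is folded in:
- //               t = (n * multiplier) >> width
- //               q = (t + ((n - t) >> 1)) >> shift
- //
- // A quick sanity check against the u32 table (illustrative only):
- // dividing by 3 uses { 1, 2863311531, 1 }, where 2863311531 is
- // ceil(2^33 / 3), so q = (n * 2863311531) >> 33; for n = 100 this
- // yields 33, as expected. Dividing by 7 uses method 2 with
- // { 2, 613566757, 2 }: for n = 100, t = 14, t + ((100 - 14) >> 1) = 57,
- // and 57 >> 2 = 14 = 100 / 7.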
- __declspec(safe)
- static unmasked inline unsigned int8
- __fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) {
- uniform int64 method = __idiv_table_u8[divisor-2][0];
- uniform int64 multiplier = __idiv_table_u8[divisor-2][1];
- uniform int64 shift = __idiv_table_u8[divisor-2][2];
- unsigned int16 mult = multiplier;
- unsigned int16 val = numerator;
- if (method == 0)
- return numerator >> shift;
- else if (method == 1)
- return (val * mult) >> (8 + shift);
- else {
- val *= mult;
- val >>= 8;
- val += (numerator-val)>>1;
- return (val >> shift);
- }
- }
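- // The signed overloads below fold the sign out of the numerator with a
- // bit-complement rather than a true negation: sign is 0 or all-ones, so
- // numerator ^= sign maps a negative n to ~n = -n - 1. After the unsigned
- // multiply-and-shift, xoring with sign maps the quotient back. Assuming
- // the table multipliers are exact over the relevant input range (which is
- // what they are constructed for), the identity floor(n/d) == ~floor(~n/d)
- // for n < 0 means negative lanes come out rounded toward negative
- // infinity.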
- __declspec(safe)
- static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) {
- uniform int8 method = __idiv_table_s8[divisor-2][0];
- uniform int16 multiplier = __idiv_table_s8[divisor-2][1];
- uniform int8 shift = __idiv_table_s8[divisor-2][2];
- if (method == 0)
- return numerator >> shift;
- else {
- unsigned int8 sign = numerator >> 7;
- numerator ^= sign;
- int16 mul = (int16)numerator * (int16)multiplier;
- mul >>= 8 + shift;
- return (int8)mul ^ sign;
- }
- }
- __declspec(safe)
- static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator,
- uniform unsigned int16 divisor) {
- uniform int64 method = __idiv_table_u16[divisor-2][0];
- uniform int64 multiplier = __idiv_table_u16[divisor-2][1];
- uniform int64 shift = __idiv_table_u16[divisor-2][2];
- unsigned int32 mult = multiplier;
- unsigned int32 val = numerator;
- if (method == 0)
- return numerator >> shift;
- else if (method == 1)
- return (val * mult) >> (16 + shift);
- else {
- val *= mult;
- val >>= 16;
- val += (numerator-val)>>1;
- return val >> shift;
- }
- }
- __declspec(safe)
- static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) {
- uniform int64 method = __idiv_table_s16[divisor-2][0];
- uniform int64 multiplier = __idiv_table_s16[divisor-2][1];
- uniform int64 shift = __idiv_table_s16[divisor-2][2];
- if (method == 0)
- return numerator >> shift;
- else {
- unsigned int16 sign = numerator >> 15;
- numerator ^= sign;
- int32 mul = (int32)numerator * (int32)multiplier;
- mul >>= 16 + shift;
- int16 result = mul;
- return result ^ sign;
- }
- }
- __declspec(safe)
- static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator,
- uniform unsigned int32 divisor) {
- uniform int64 method = __idiv_table_u32[divisor-2][0];
- uniform int64 multiplier = __idiv_table_u32[divisor-2][1];
- uniform int64 shift = __idiv_table_u32[divisor-2][2];
- unsigned int64 mult = multiplier;
- unsigned int64 val = numerator;
- if (method == 0)
- return numerator >> shift;
- else if (method == 1)
- return (val * mult) >> (32 + shift);
- else {
- val *= mult;
- val >>= 32;
- val += (numerator-val)>>1;
- return val >> shift;
- }
- }
- __declspec(safe)
- static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) {
- uniform int64 method = __idiv_table_s32[divisor-2][0];
- uniform int64 multiplier = __idiv_table_s32[divisor-2][1];
- uniform int64 shift = __idiv_table_s32[divisor-2][2];
- if (method == 0)
- return numerator >> shift;
- else {
- unsigned int32 sign = numerator >> 31;
- numerator ^= sign;
- int64 mul = (int64)numerator * (int64)multiplier;
- mul >>= 32 + shift;
- int32 result = mul;
- return result ^ sign;
- }
- }
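- // Illustrative use (a sketch, not part of the public API): given a varying
- // numerator and a uniform divisor known at the call site to be >= 2, the
- // overloads above replace a hardware divide with a multiply and shift:
- //
- //     varying unsigned int32 n = ...;
- //     varying unsigned int32 q = __fast_idiv(n, (uniform unsigned int32)7);
- //     // q == n / 7 in every active lane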
- ///////////////////////////////////////////////////////////////////////////
- // Saturating int8/int16 ops
- __declspec(safe)
- static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) {
- return __avg_up_uint8(a, b);
- }
- __declspec(safe)
- static unmasked inline int8 avg_up(int8 a, int8 b) {
- return __avg_up_int8(a, b);
- }
- __declspec(safe)
- static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) {
- return __avg_up_uint16(a, b);
- }
- __declspec(safe)
- static unmasked inline int16 avg_up(int16 a, int16 b) {
- return __avg_up_int16(a, b);
- }
- __declspec(safe)
- static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) {
- return __avg_down_uint8(a, b);
- }
- __declspec(safe)
- static unmasked inline int8 avg_down(int8 a, int8 b) {
- return __avg_down_int8(a, b);
- }
- __declspec(safe)
- static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) {
- return __avg_down_uint16(a, b);
- }
- __declspec(safe)
- static unmasked inline int16 avg_down(int16 a, int16 b) {
- return __avg_down_int16(a, b);
- }
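- // These wrappers map to the target's packed-average operations where
- // available. Semantically (a sketch, not the built-ins' definitions):
- //
- //     avg_up(a, b)   == (a + b + 1) >> 1   // average, rounded up
- //     avg_down(a, b) == (a + b) >> 1       // average, rounded down
- //
- // with the addition carried out in a wider type, so the intermediate sum
- // cannot overflow the 8- or 16-bit operands.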